diff --git a/CHANGELOG.md b/CHANGELOG.md index 00d70621ebe..51c929631c9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,7 @@ Docs: https://docs.openclaw.ai ### Changes +- Diagnostics/OTEL: support `OPENCLAW_OTEL_PRELOADED=1` so the plugin can reuse an already-registered OpenTelemetry SDK while keeping OpenClaw diagnostic listeners wired. (#70424) Thanks @jlapenna. - Control UI: refine the agent Tool Access panel with compact live-tool chips, collapsible tool groups, direct per-tool toggles, and clearer runtime/source provenance. (#71405) Thanks @BunsDev. diff --git a/docs/gateway/configuration-reference.md b/docs/gateway/configuration-reference.md index 0880b163454..98305dbef98 100644 --- a/docs/gateway/configuration-reference.md +++ b/docs/gateway/configuration-reference.md @@ -896,6 +896,7 @@ Notes: - `otel.sampleRate`: trace sampling rate `0`–`1`. - `otel.flushIntervalMs`: periodic telemetry flush interval in ms. - `otel.captureContent`: opt-in raw content capture for OTEL span attributes. Defaults to off. Boolean `true` captures non-system message/tool content; the object form lets you enable `inputMessages`, `outputMessages`, `toolInputs`, `toolOutputs`, and `systemPrompt` explicitly. +- `OPENCLAW_OTEL_PRELOADED=1`: environment toggle for hosts that already registered a global OpenTelemetry SDK. OpenClaw then skips plugin-owned SDK startup/shutdown while keeping diagnostic listeners active. - `cacheTrace.enabled`: log cache trace snapshots for embedded runs (default: `false`). - `cacheTrace.filePath`: output path for cache trace JSONL (default: `$OPENCLAW_STATE_DIR/logs/cache-trace.jsonl`). - `cacheTrace.includeMessages` / `includePrompt` / `includeSystem`: control what is included in cache trace output (all default: `true`). diff --git a/docs/logging.md b/docs/logging.md index d9bc63f431e..210b3054137 100644 --- a/docs/logging.md +++ b/docs/logging.md @@ -307,6 +307,10 @@ Notes: - Set `headers` when your collector requires auth. - Environment variables supported: `OTEL_EXPORTER_OTLP_ENDPOINT`, `OTEL_SERVICE_NAME`, `OTEL_EXPORTER_OTLP_PROTOCOL`. +- Set `OPENCLAW_OTEL_PRELOADED=1` when another preload or host process already + registered the global OpenTelemetry SDK. In that mode the plugin does not start + or shut down its own SDK, but it still wires OpenClaw diagnostic listeners and + honors `diagnostics.otel.traces`, `metrics`, and `logs`. ### Exported metrics (names + types) @@ -389,6 +393,8 @@ classes you opted into. `OTEL_EXPORTER_OTLP_ENDPOINT`. - If the endpoint already contains `/v1/traces` or `/v1/metrics`, it is used as-is. - If the endpoint already contains `/v1/logs`, it is used as-is for logs. +- `OPENCLAW_OTEL_PRELOADED=1` reuses an externally registered OpenTelemetry SDK + for traces/metrics instead of starting a plugin-owned NodeSDK. - `diagnostics.otel.logs` enables OTLP log export for the main logger output. ### Log export behavior diff --git a/extensions/diagnostics-otel/src/service.test.ts b/extensions/diagnostics-otel/src/service.test.ts index 0cda7fe9df3..a11f969ca0c 100644 --- a/extensions/diagnostics-otel/src/service.test.ts +++ b/extensions/diagnostics-otel/src/service.test.ts @@ -1,4 +1,4 @@ -import { beforeEach, describe, expect, test, vi } from "vitest"; +import { afterEach, beforeEach, describe, expect, test, vi } from "vitest"; const telemetryState = vi.hoisted(() => { const counters = new Map }>(); @@ -125,6 +125,7 @@ const GRANDCHILD_SPAN_ID = "2222222222222222"; const PROTO_KEY = "__proto__"; const MAX_TEST_OTEL_CONTENT_ATTRIBUTE_CHARS = 4096; const OTEL_TRUNCATED_SUFFIX_MAX_CHARS = 20; +const ORIGINAL_OPENCLAW_OTEL_PRELOADED = process.env.OPENCLAW_OTEL_PRELOADED; function createLogger() { return { @@ -194,6 +195,7 @@ function flushDiagnosticEvents() { describe("diagnostics-otel service", () => { beforeEach(() => { + delete process.env.OPENCLAW_OTEL_PRELOADED; telemetryState.counters.clear(); telemetryState.histograms.clear(); telemetryState.spans.length = 0; @@ -208,6 +210,14 @@ describe("diagnostics-otel service", () => { traceExporterCtor.mockClear(); }); + afterEach(() => { + if (ORIGINAL_OPENCLAW_OTEL_PRELOADED === undefined) { + delete process.env.OPENCLAW_OTEL_PRELOADED; + } else { + process.env.OPENCLAW_OTEL_PRELOADED = ORIGINAL_OPENCLAW_OTEL_PRELOADED; + } + }); + test("records message-flow metrics and spans", async () => { const service = createDiagnosticsOtelService(); const ctx = createOtelContext(OTEL_TEST_ENDPOINT, { traces: true, metrics: true, logs: true }); @@ -318,6 +328,84 @@ describe("diagnostics-otel service", () => { expect(telemetryState.tracer.startSpan).not.toHaveBeenCalled(); }); + test("uses a preloaded OpenTelemetry SDK without dropping diagnostic listeners", async () => { + process.env.OPENCLAW_OTEL_PRELOADED = "1"; + const service = createDiagnosticsOtelService(); + const ctx = createOtelContext(OTEL_TEST_ENDPOINT, { traces: true, metrics: true, logs: true }); + await service.start(ctx); + + expect(sdkStart).not.toHaveBeenCalled(); + expect(traceExporterCtor).not.toHaveBeenCalled(); + expect(ctx.logger.info).toHaveBeenCalledWith( + "diagnostics-otel: using preloaded OpenTelemetry SDK", + ); + + emitDiagnosticEvent({ + type: "run.completed", + runId: "run-1", + provider: "openai", + model: "gpt-5.4", + outcome: "completed", + durationMs: 100, + }); + emitDiagnosticEvent({ + type: "log.record", + level: "INFO", + message: "preloaded log", + }); + await flushDiagnosticEvents(); + + expect(telemetryState.histograms.get("openclaw.run.duration_ms")?.record).toHaveBeenCalledWith( + 100, + expect.objectContaining({ + "openclaw.provider": "openai", + "openclaw.model": "gpt-5.4", + }), + ); + expect(telemetryState.tracer.startSpan).toHaveBeenCalledWith( + "openclaw.run", + expect.objectContaining({ + attributes: expect.objectContaining({ + "openclaw.outcome": "completed", + }), + }), + undefined, + ); + expect(logEmit).toHaveBeenCalled(); + + await service.stop?.(ctx); + expect(sdkShutdown).not.toHaveBeenCalled(); + expect(logShutdown).toHaveBeenCalledTimes(1); + }); + + test("honors disabled traces when an OpenTelemetry SDK is preloaded", async () => { + process.env.OPENCLAW_OTEL_PRELOADED = "1"; + const service = createDiagnosticsOtelService(); + const ctx = createOtelContext(OTEL_TEST_ENDPOINT, { traces: false, metrics: true }); + await service.start(ctx); + + emitDiagnosticEvent({ + type: "run.completed", + runId: "run-1", + provider: "openai", + model: "gpt-5.4", + outcome: "completed", + durationMs: 100, + }); + + expect(sdkStart).not.toHaveBeenCalled(); + expect(telemetryState.histograms.get("openclaw.run.duration_ms")?.record).toHaveBeenCalledWith( + 100, + expect.objectContaining({ + "openclaw.provider": "openai", + }), + ); + expect(telemetryState.tracer.startSpan).not.toHaveBeenCalled(); + + await service.stop?.(ctx); + expect(sdkShutdown).not.toHaveBeenCalled(); + }); + test("tears down active handles when restarted with diagnostics disabled", async () => { const service = createDiagnosticsOtelService(); const enabledCtx = createOtelContext(OTEL_TEST_ENDPOINT, { diff --git a/extensions/diagnostics-otel/src/service.ts b/extensions/diagnostics-otel/src/service.ts index 876e2388977..8f08472b07d 100644 --- a/extensions/diagnostics-otel/src/service.ts +++ b/extensions/diagnostics-otel/src/service.ts @@ -49,6 +49,7 @@ const LOG_RECORD_EXPORT_FAILURE_REPORT_INTERVAL_MS = 60_000; const OTEL_LOG_RAW_ATTRIBUTE_KEY_RE = /^[A-Za-z0-9_.:-]{1,64}$/u; const OTEL_LOG_ATTRIBUTE_KEY_RE = /^[A-Za-z0-9_.:-]{1,96}$/u; const BLOCKED_OTEL_LOG_ATTRIBUTE_KEYS = new Set(["__proto__", "prototype", "constructor"]); +const PRELOADED_OTEL_SDK_ENV = "OPENCLAW_OTEL_PRELOADED"; type OtelContentCapturePolicy = { inputMessages: boolean; @@ -164,6 +165,10 @@ function resolveContentCapturePolicy(value: unknown): OtelContentCapturePolicy { }; } +function hasPreloadedOtelSdk(): boolean { + return process.env[PRELOADED_OTEL_SDK_ENV] === "1"; +} + function normalizeOtelContentValue(value: unknown): string | undefined { if (typeof value === "string") { return normalizeOtelLogString(value, MAX_OTEL_CONTENT_ATTRIBUTE_CHARS); @@ -400,38 +405,39 @@ export function createDiagnosticsOtelService(): OpenClawPluginService { if (!tracesEnabled && !metricsEnabled && !logsEnabled) { return; } + const sdkPreloaded = hasPreloadedOtelSdk(); const resource = resourceFromAttributes({ [ATTR_SERVICE_NAME]: serviceName, }); - const traceUrl = resolveOtelUrl(endpoint, "v1/traces"); - const metricUrl = resolveOtelUrl(endpoint, "v1/metrics"); const logUrl = resolveOtelUrl(endpoint, "v1/logs"); - const traceExporter = tracesEnabled - ? new OTLPTraceExporter({ - ...(traceUrl ? { url: traceUrl } : {}), - ...(headers ? { headers } : {}), - }) - : undefined; + if (!sdkPreloaded && (tracesEnabled || metricsEnabled)) { + const traceUrl = resolveOtelUrl(endpoint, "v1/traces"); + const metricUrl = resolveOtelUrl(endpoint, "v1/metrics"); + const traceExporter = tracesEnabled + ? new OTLPTraceExporter({ + ...(traceUrl ? { url: traceUrl } : {}), + ...(headers ? { headers } : {}), + }) + : undefined; - const metricExporter = metricsEnabled - ? new OTLPMetricExporter({ - ...(metricUrl ? { url: metricUrl } : {}), - ...(headers ? { headers } : {}), - }) - : undefined; + const metricExporter = metricsEnabled + ? new OTLPMetricExporter({ + ...(metricUrl ? { url: metricUrl } : {}), + ...(headers ? { headers } : {}), + }) + : undefined; - const metricReader = metricExporter - ? new PeriodicExportingMetricReader({ - exporter: metricExporter, - ...(typeof otel.flushIntervalMs === "number" - ? { exportIntervalMillis: Math.max(1000, otel.flushIntervalMs) } - : {}), - }) - : undefined; + const metricReader = metricExporter + ? new PeriodicExportingMetricReader({ + exporter: metricExporter, + ...(typeof otel.flushIntervalMs === "number" + ? { exportIntervalMillis: Math.max(1000, otel.flushIntervalMs) } + : {}), + }) + : undefined; - if (tracesEnabled || metricsEnabled) { sdk = new NodeSDK({ resource, ...(traceExporter ? { traceExporter } : {}), @@ -452,6 +458,8 @@ export function createDiagnosticsOtelService(): OpenClawPluginService { ctx.logger.error(`diagnostics-otel: failed to start SDK: ${formatError(err)}`); throw err; } + } else if (sdkPreloaded && (tracesEnabled || metricsEnabled)) { + ctx.logger.info("diagnostics-otel: using preloaded OpenTelemetry SDK"); } const logSeverityMap: Record = {