diff --git a/CHANGELOG.md b/CHANGELOG.md index 406a77bed70..71d4609015e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -26,6 +26,8 @@ Docs: https://docs.openclaw.ai - Plugins: migrate the local plugin registry automatically during package install/update, keeping install metadata in the plugin index while indexing existing plugin manifests for the new cold registry path. Thanks @vincentkoc and @shakkernerd. - Plugins/doctor: make `openclaw doctor --fix` refresh the plugin index and cold registry index when needed without treating plugin install records as authored config. Thanks @vincentkoc and @shakkernerd. - Diagnostics/OTEL: align model-call GenAI span attributes with OpenTelemetry stability opt-in semantics, keeping legacy `gen_ai.system` by default while emitting `gen_ai.provider.name` under `OTEL_SEMCONV_STABILITY_OPT_IN=gen_ai_latest_experimental`. Thanks @vincentkoc. +- Diagnostics/OTEL: support signal-specific OTLP endpoint overrides for traces, metrics, and logs via config or standard OTEL environment variables. Thanks @vincentkoc. +- Diagnostics/OTEL: emit bounded telemetry exporter health diagnostics for startup and log-export failures without exporting raw error text. Thanks @vincentkoc. - Plugins/CLI: add `openclaw plugins registry` for explicit persisted-registry inspection and `--refresh` repair without making normal startup rescan plugin locations. Thanks @vincentkoc. - Plugins/CLI: make `openclaw plugins list` read the cold persisted registry snapshot by default, leaving module-aware diagnostics to `plugins doctor` and `plugins inspect`. Thanks @vincentkoc. - Plugins/startup: move gateway startup plugin planning onto the versioned cold registry index, with postinstall repair for older registry files that predate startup metadata. Thanks @vincentkoc. 
diff --git a/docs/.generated/config-baseline.sha256 b/docs/.generated/config-baseline.sha256 index d4414089c82..d716da5c85b 100644 --- a/docs/.generated/config-baseline.sha256 +++ b/docs/.generated/config-baseline.sha256 @@ -1,4 +1,4 @@ -15a3740b57d0c95f0c0963c1d0eff6d85ecdb8cb03960b4763e847f8a24551c0 config-baseline.json -3c39a3a2008ce938886b600e9429a71921c1f9b00c64a16801f47d6d8d2ad7a8 config-baseline.core.json +211e9d4cdb309e7fe0c1ed91d060201240a9287f8c5cb3c893aba3f904a20d30 config-baseline.json +ffda2d2911adc03148a368f3b40b17cbdcb7af0066bccdc555e8d596cdea8cda config-baseline.core.json 7cd9c908f066c143eab2a201efbc9640f483ab28bba92ddeca1d18cc2b528bc3 config-baseline.channel.json 9e131d7734f8b9cc9e7f8af6cc6b6dc81c9971dc551fadbe66fb0d682173f32d config-baseline.plugin.json diff --git a/docs/gateway/configuration-reference.md b/docs/gateway/configuration-reference.md index 3628b84a096..26a930ec22f 100644 --- a/docs/gateway/configuration-reference.md +++ b/docs/gateway/configuration-reference.md @@ -869,6 +869,9 @@ Notes: otel: { enabled: false, endpoint: "https://otel-collector.example.com:4318", + tracesEndpoint: "https://traces.example.com/v1/traces", + metricsEndpoint: "https://metrics.example.com/v1/metrics", + logsEndpoint: "https://logs.example.com/v1/logs", protocol: "http/protobuf", // http/protobuf | grpc headers: { "x-tenant-id": "my-org" }, serviceName: "openclaw-gateway", @@ -903,6 +906,7 @@ Notes: - `stuckSessionWarnMs`: age threshold in ms for emitting stuck-session warnings while a session remains in processing state. - `otel.enabled`: enables the OpenTelemetry export pipeline (default: `false`). For the full configuration, signal catalog, and privacy model, see [OpenTelemetry export](/gateway/opentelemetry). - `otel.endpoint`: collector URL for OTel export. +- `otel.tracesEndpoint` / `otel.metricsEndpoint` / `otel.logsEndpoint`: optional signal-specific OTLP endpoints. When set, they override `otel.endpoint` for that signal only. 
- `otel.protocol`: `"http/protobuf"` (default) or `"grpc"`. - `otel.headers`: extra HTTP/gRPC metadata headers sent with OTel export requests. - `otel.serviceName`: service name for resource attributes. @@ -912,6 +916,7 @@ Notes: - `otel.captureContent`: opt-in raw content capture for OTEL span attributes. Defaults to off. Boolean `true` captures non-system message/tool content; the object form lets you enable `inputMessages`, `outputMessages`, `toolInputs`, `toolOutputs`, and `systemPrompt` explicitly. - `OTEL_SEMCONV_STABILITY_OPT_IN=gen_ai_latest_experimental`: environment toggle for latest experimental GenAI span provider attributes. By default spans keep the legacy `gen_ai.system` attribute for compatibility; GenAI metrics use bounded semantic attributes. - `OPENCLAW_OTEL_PRELOADED=1`: environment toggle for hosts that already registered a global OpenTelemetry SDK. OpenClaw then skips plugin-owned SDK startup/shutdown while keeping diagnostic listeners active. +- `OTEL_EXPORTER_OTLP_TRACES_ENDPOINT`, `OTEL_EXPORTER_OTLP_METRICS_ENDPOINT`, and `OTEL_EXPORTER_OTLP_LOGS_ENDPOINT`: signal-specific endpoint env vars used when the matching config key is unset. - `cacheTrace.enabled`: log cache trace snapshots for embedded runs (default: `false`). - `cacheTrace.filePath`: output path for cache trace JSONL (default: `$OPENCLAW_STATE_DIR/logs/cache-trace.jsonl`). - `cacheTrace.includeMessages` / `includePrompt` / `includeSystem`: control what is included in cache trace output (all default: `true`). diff --git a/docs/gateway/opentelemetry.md b/docs/gateway/opentelemetry.md index 697c15216e5..fee7172ca81 100644 --- a/docs/gateway/opentelemetry.md +++ b/docs/gateway/opentelemetry.md @@ -79,6 +79,9 @@ when `diagnostics.otel.enabled` is true. 
otel: { enabled: true, endpoint: "http://otel-collector:4318", + tracesEndpoint: "http://otel-collector:4318/v1/traces", + metricsEndpoint: "http://otel-collector:4318/v1/metrics", + logsEndpoint: "http://otel-collector:4318/v1/logs", protocol: "http/protobuf", // grpc is ignored serviceName: "openclaw-gateway", headers: { "x-collector-token": "..." }, @@ -102,13 +105,14 @@ when `diagnostics.otel.enabled` is true. ### Environment variables -| Variable | Purpose | -| ------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | -| `OTEL_EXPORTER_OTLP_ENDPOINT` | Override `diagnostics.otel.endpoint`. If the value already contains `/v1/traces`, `/v1/metrics`, or `/v1/logs`, it is used as-is. | -| `OTEL_SERVICE_NAME` | Override `diagnostics.otel.serviceName`. | -| `OTEL_EXPORTER_OTLP_PROTOCOL` | Override the wire protocol (only `http/protobuf` is honored today). | -| `OTEL_SEMCONV_STABILITY_OPT_IN` | Set to `gen_ai_latest_experimental` to emit the latest experimental GenAI span attribute (`gen_ai.provider.name`) instead of the legacy `gen_ai.system`. GenAI metrics always use bounded, low-cardinality semantic attributes regardless. | -| `OPENCLAW_OTEL_PRELOADED` | Set to `1` when another preload or host process already registered the global OpenTelemetry SDK. The plugin then skips its own NodeSDK lifecycle but still wires diagnostic listeners and honors `traces`/`metrics`/`logs`. 
| +| Variable | Purpose | +| ----------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| `OTEL_EXPORTER_OTLP_ENDPOINT` | Shared endpoint used when `diagnostics.otel.endpoint` is unset (config wins over env). If the value already contains `/v1/traces`, `/v1/metrics`, or `/v1/logs`, it is used as-is. | +| `OTEL_EXPORTER_OTLP_TRACES_ENDPOINT` / `OTEL_EXPORTER_OTLP_METRICS_ENDPOINT` / `OTEL_EXPORTER_OTLP_LOGS_ENDPOINT` | Signal-specific endpoint overrides used when the matching `diagnostics.otel.*Endpoint` config key is unset. Signal-specific config wins over signal-specific env, which wins over the shared endpoint. | +| `OTEL_SERVICE_NAME` | Service name used when `diagnostics.otel.serviceName` is unset or blank. | +| `OTEL_EXPORTER_OTLP_PROTOCOL` | Wire protocol used when `diagnostics.otel.protocol` is unset (only `http/protobuf` is honored today). | +| `OTEL_SEMCONV_STABILITY_OPT_IN` | Set to `gen_ai_latest_experimental` to emit the latest experimental GenAI span attribute (`gen_ai.provider.name`) instead of the legacy `gen_ai.system`. GenAI metrics always use bounded, low-cardinality semantic attributes regardless. | +| `OPENCLAW_OTEL_PRELOADED` | Set to `1` when another preload or host process already registered the global OpenTelemetry SDK. The plugin then skips its own NodeSDK lifecycle but still wires diagnostic listeners and honors `traces`/`metrics`/`logs`. 
| ## Privacy and content capture diff --git a/extensions/diagnostics-otel/src/service.test.ts b/extensions/diagnostics-otel/src/service.test.ts index 5f7a26fe7b6..2f2823ba00f 100644 --- a/extensions/diagnostics-otel/src/service.test.ts +++ b/extensions/diagnostics-otel/src/service.test.ts @@ -41,6 +41,8 @@ const sdkShutdown = vi.hoisted(() => vi.fn().mockResolvedValue(undefined)); const logEmit = vi.hoisted(() => vi.fn()); const logShutdown = vi.hoisted(() => vi.fn().mockResolvedValue(undefined)); const traceExporterCtor = vi.hoisted(() => vi.fn()); +const metricExporterCtor = vi.hoisted(() => vi.fn()); +const logExporterCtor = vi.hoisted(() => vi.fn()); vi.mock("@opentelemetry/api", () => ({ context: { @@ -70,7 +72,9 @@ vi.mock("@opentelemetry/sdk-node", () => ({ })); vi.mock("@opentelemetry/exporter-metrics-otlp-proto", () => ({ - OTLPMetricExporter: function OTLPMetricExporter() {}, + OTLPMetricExporter: function OTLPMetricExporter(options?: unknown) { + metricExporterCtor(options); + }, })); vi.mock("@opentelemetry/exporter-trace-otlp-proto", () => ({ @@ -80,7 +84,9 @@ vi.mock("@opentelemetry/exporter-trace-otlp-proto", () => ({ })); vi.mock("@opentelemetry/exporter-logs-otlp-proto", () => ({ - OTLPLogExporter: function OTLPLogExporter() {}, + OTLPLogExporter: function OTLPLogExporter(options?: unknown) { + logExporterCtor(options); + }, })); vi.mock("@opentelemetry/sdk-logs", () => ({ @@ -133,6 +139,10 @@ const PROTO_KEY = "__proto__"; const MAX_TEST_OTEL_CONTENT_ATTRIBUTE_CHARS = 4096; const OTEL_TRUNCATED_SUFFIX_MAX_CHARS = 20; const ORIGINAL_OPENCLAW_OTEL_PRELOADED = process.env.OPENCLAW_OTEL_PRELOADED; +const ORIGINAL_OTEL_EXPORTER_OTLP_TRACES_ENDPOINT = process.env.OTEL_EXPORTER_OTLP_TRACES_ENDPOINT; +const ORIGINAL_OTEL_EXPORTER_OTLP_METRICS_ENDPOINT = + process.env.OTEL_EXPORTER_OTLP_METRICS_ENDPOINT; +const ORIGINAL_OTEL_EXPORTER_OTLP_LOGS_ENDPOINT = process.env.OTEL_EXPORTER_OTLP_LOGS_ENDPOINT; const ORIGINAL_OTEL_SEMCONV_STABILITY_OPT_IN = 
process.env.OTEL_SEMCONV_STABILITY_OPT_IN; function createLogger() { @@ -173,7 +183,10 @@ function createOtelContext( }, logger: createLogger(), stateDir: OTEL_TEST_STATE_DIR, - internalDiagnostics: { onEvent: onInternalDiagnosticEvent }, + internalDiagnostics: { + emit: emitTrustedDiagnosticEvent, + onEvent: onInternalDiagnosticEvent, + }, }; } @@ -220,6 +233,11 @@ describe("diagnostics-otel service", () => { logEmit.mockReset(); logShutdown.mockClear(); traceExporterCtor.mockClear(); + metricExporterCtor.mockClear(); + logExporterCtor.mockClear(); + delete process.env.OTEL_EXPORTER_OTLP_TRACES_ENDPOINT; + delete process.env.OTEL_EXPORTER_OTLP_METRICS_ENDPOINT; + delete process.env.OTEL_EXPORTER_OTLP_LOGS_ENDPOINT; }); afterEach(() => { @@ -233,6 +251,22 @@ describe("diagnostics-otel service", () => { } else { process.env.OTEL_SEMCONV_STABILITY_OPT_IN = ORIGINAL_OTEL_SEMCONV_STABILITY_OPT_IN; } + if (ORIGINAL_OTEL_EXPORTER_OTLP_TRACES_ENDPOINT === undefined) { + delete process.env.OTEL_EXPORTER_OTLP_TRACES_ENDPOINT; + } else { + process.env.OTEL_EXPORTER_OTLP_TRACES_ENDPOINT = ORIGINAL_OTEL_EXPORTER_OTLP_TRACES_ENDPOINT; + } + if (ORIGINAL_OTEL_EXPORTER_OTLP_METRICS_ENDPOINT === undefined) { + delete process.env.OTEL_EXPORTER_OTLP_METRICS_ENDPOINT; + } else { + process.env.OTEL_EXPORTER_OTLP_METRICS_ENDPOINT = + ORIGINAL_OTEL_EXPORTER_OTLP_METRICS_ENDPOINT; + } + if (ORIGINAL_OTEL_EXPORTER_OTLP_LOGS_ENDPOINT === undefined) { + delete process.env.OTEL_EXPORTER_OTLP_LOGS_ENDPOINT; + } else { + process.env.OTEL_EXPORTER_OTLP_LOGS_ENDPOINT = ORIGINAL_OTEL_EXPORTER_OTLP_LOGS_ENDPOINT; + } }); test("records message-flow metrics and spans", async () => { @@ -395,6 +429,124 @@ describe("diagnostics-otel service", () => { expect(logShutdown).toHaveBeenCalledTimes(1); }); + test("emits and records bounded telemetry exporter health events", async () => { + const events: Array[0]>[0]> = []; + const unsubscribe = onInternalDiagnosticEvent((event) => { + if (event.type === 
"telemetry.exporter") { + events.push(event); + } + }); + const service = createDiagnosticsOtelService(); + const ctx = createOtelContext(OTEL_TEST_ENDPOINT, { traces: true, metrics: true, logs: true }); + + await service.start(ctx); + + expect(events).toEqual( + expect.arrayContaining([ + expect.objectContaining({ + type: "telemetry.exporter", + exporter: "diagnostics-otel", + signal: "traces", + status: "started", + reason: "configured", + }), + expect.objectContaining({ + type: "telemetry.exporter", + exporter: "diagnostics-otel", + signal: "metrics", + status: "started", + reason: "configured", + }), + expect.objectContaining({ + type: "telemetry.exporter", + exporter: "diagnostics-otel", + signal: "logs", + status: "started", + reason: "configured", + }), + ]), + ); + expect( + telemetryState.counters.get("openclaw.telemetry.exporter.events")?.add, + ).toHaveBeenCalledWith(1, { + "openclaw.exporter": "diagnostics-otel", + "openclaw.signal": "logs", + "openclaw.status": "started", + "openclaw.reason": "configured", + }); + + unsubscribe(); + await service.stop?.(ctx); + }); + + test("reports log exporter emit failures without exporting raw error text", async () => { + const events: Array[0]>[0]> = []; + const unsubscribe = onInternalDiagnosticEvent((event) => { + if (event.type === "telemetry.exporter") { + events.push(event); + } + }); + const service = createDiagnosticsOtelService(); + const ctx = createOtelContext(OTEL_TEST_ENDPOINT, { logs: true }); + logEmit.mockImplementationOnce(() => { + throw new TypeError("token sk-test-secret should not leave as telemetry"); + }); + + await service.start(ctx); + emitDiagnosticEvent({ + type: "log.record", + level: "INFO", + message: "export me", + }); + await flushDiagnosticEvents(); + + expect(events).toEqual( + expect.arrayContaining([ + expect.objectContaining({ + type: "telemetry.exporter", + exporter: "diagnostics-otel", + signal: "logs", + status: "failure", + reason: "emit_failed", + errorCategory: 
"TypeError", + }), + ]), + ); + expect( + telemetryState.counters.get("openclaw.telemetry.exporter.events")?.add, + ).toHaveBeenCalledWith(1, { + "openclaw.exporter": "diagnostics-otel", + "openclaw.signal": "logs", + "openclaw.status": "failure", + "openclaw.reason": "emit_failed", + "openclaw.errorCategory": "TypeError", + }); + + unsubscribe(); + await service.stop?.(ctx); + }); + + test("ignores untrusted telemetry exporter events for OTEL metrics", async () => { + const service = createDiagnosticsOtelService(); + const ctx = createOtelContext(OTEL_TEST_ENDPOINT, { metrics: true }); + + await service.start(ctx); + telemetryState.counters.get("openclaw.telemetry.exporter.events")?.add.mockClear(); + emitDiagnosticEvent({ + type: "telemetry.exporter", + exporter: "spoofed-plugin-exporter", + signal: "metrics", + status: "failure", + reason: "emit_failed", + }); + + expect( + telemetryState.counters.get("openclaw.telemetry.exporter.events")?.add, + ).not.toHaveBeenCalled(); + + await service.stop?.(ctx); + }); + test("honors disabled traces when an OpenTelemetry SDK is preloaded", async () => { process.env.OPENCLAW_OTEL_PRELOADED = "1"; const service = createDiagnosticsOtelService(); @@ -489,6 +641,50 @@ describe("diagnostics-otel service", () => { await service.stop?.(ctx); }); + test("uses signal-specific OTLP endpoints ahead of the shared endpoint", async () => { + const service = createDiagnosticsOtelService(); + const ctx = createOtelContext(OTEL_TEST_ENDPOINT, { + traces: true, + metrics: true, + logs: true, + }); + ctx.config.diagnostics!.otel!.tracesEndpoint = "https://trace.example.com/otlp"; + ctx.config.diagnostics!.otel!.metricsEndpoint = "https://metric.example.com/v1/metrics"; + ctx.config.diagnostics!.otel!.logsEndpoint = "https://log.example.com/otlp"; + + await service.start(ctx); + + const traceOptions = traceExporterCtor.mock.calls[0]?.[0] as { url?: string } | undefined; + const metricOptions = metricExporterCtor.mock.calls[0]?.[0] as { url?: 
string } | undefined; + const logOptions = logExporterCtor.mock.calls[0]?.[0] as { url?: string } | undefined; + expect(traceOptions?.url).toBe("https://trace.example.com/otlp/v1/traces"); + expect(metricOptions?.url).toBe("https://metric.example.com/v1/metrics"); + expect(logOptions?.url).toBe("https://log.example.com/otlp/v1/logs"); + await service.stop?.(ctx); + }); + + test("uses signal-specific OTLP env endpoints when config is unset", async () => { + process.env.OTEL_EXPORTER_OTLP_TRACES_ENDPOINT = "https://trace-env.example.com/v1/traces"; + process.env.OTEL_EXPORTER_OTLP_METRICS_ENDPOINT = "https://metric-env.example.com/otlp"; + process.env.OTEL_EXPORTER_OTLP_LOGS_ENDPOINT = "https://log-env.example.com/otlp"; + + const service = createDiagnosticsOtelService(); + const ctx = createOtelContext(OTEL_TEST_ENDPOINT, { + traces: true, + metrics: true, + logs: true, + }); + await service.start(ctx); + + const traceOptions = traceExporterCtor.mock.calls[0]?.[0] as { url?: string } | undefined; + const metricOptions = metricExporterCtor.mock.calls[0]?.[0] as { url?: string } | undefined; + const logOptions = logExporterCtor.mock.calls[0]?.[0] as { url?: string } | undefined; + expect(traceOptions?.url).toBe("https://trace-env.example.com/v1/traces"); + expect(metricOptions?.url).toBe("https://metric-env.example.com/otlp/v1/metrics"); + expect(logOptions?.url).toBe("https://log-env.example.com/otlp/v1/logs"); + await service.stop?.(ctx); + }); + test("redacts sensitive data from log messages before export", async () => { const emitCall = await emitAndCaptureLog({ level: "INFO", diff --git a/extensions/diagnostics-otel/src/service.ts b/extensions/diagnostics-otel/src/service.ts index 6176956b064..41c193faf57 100644 --- a/extensions/diagnostics-otel/src/service.ts +++ b/extensions/diagnostics-otel/src/service.ts @@ -50,6 +50,10 @@ const OTEL_LOG_RAW_ATTRIBUTE_KEY_RE = /^[A-Za-z0-9_.:-]{1,64}$/u; const OTEL_LOG_ATTRIBUTE_KEY_RE = /^[A-Za-z0-9_.:-]{1,96}$/u; const 
BLOCKED_OTEL_LOG_ATTRIBUTE_KEYS = new Set(["__proto__", "prototype", "constructor"]); const PRELOADED_OTEL_SDK_ENV = "OPENCLAW_OTEL_PRELOADED"; +const OTEL_EXPORTER_OTLP_ENDPOINT_ENV = "OTEL_EXPORTER_OTLP_ENDPOINT"; +const OTEL_EXPORTER_OTLP_TRACES_ENDPOINT_ENV = "OTEL_EXPORTER_OTLP_TRACES_ENDPOINT"; +const OTEL_EXPORTER_OTLP_METRICS_ENDPOINT_ENV = "OTEL_EXPORTER_OTLP_METRICS_ENDPOINT"; +const OTEL_EXPORTER_OTLP_LOGS_ENDPOINT_ENV = "OTEL_EXPORTER_OTLP_LOGS_ENDPOINT"; const OTEL_SEMCONV_STABILITY_OPT_IN_ENV = "OTEL_SEMCONV_STABILITY_OPT_IN"; const GEN_AI_LATEST_EXPERIMENTAL_OPT_IN = "gen_ai_latest_experimental"; const GEN_AI_TOKEN_USAGE_BUCKETS = [ @@ -77,6 +81,10 @@ type ModelCallLifecycleDiagnosticEvent = Extract< DiagnosticEventPayload, { type: "model.call.completed" | "model.call.error" } >; +type TelemetryExporterDiagnosticEvent = Extract< + DiagnosticEventPayload, + { type: "telemetry.exporter" } +>; const NO_CONTENT_CAPTURE: OtelContentCapturePolicy = { inputMessages: false, @@ -102,6 +110,18 @@ function resolveOtelUrl(endpoint: string | undefined, path: string): string | un return `${endpoint}/${path}`; } +function resolveSignalOtelUrl(params: { + signalEndpoint?: string; + signalEnvEndpoint?: string; + endpoint?: string; + path: string; +}): string | undefined { + return resolveOtelUrl( + normalizeEndpoint(params.signalEndpoint ?? params.signalEnvEndpoint) ?? 
params.endpoint, + params.path, + ); +} + function resolveSampleRate(value: number | undefined): number | undefined { if (typeof value !== "number" || !Number.isFinite(value)) { return undefined; @@ -126,6 +146,17 @@ function formatError(err: unknown): string { } } +function errorCategory(err: unknown): string { + try { + if (err instanceof Error && typeof err.name === "string" && err.name.trim()) { + return lowCardinalityAttr(err.name, "Error"); + } + return lowCardinalityAttr(typeof err, "unknown"); + } catch { + return "unknown"; + } +} + function redactOtelAttributes(attributes: Record) { const redactedAttributes: Record = {}; for (const [key, value] of Object.entries(attributes)) { @@ -513,35 +544,82 @@ export function createDiagnosticsOtelService(): OpenClawPluginService { return; } + const emitExporterEvent = ( + event: Omit, + ) => { + try { + ctx.internalDiagnostics?.emit({ + type: "telemetry.exporter", + ...event, + }); + } catch { + // Exporter health must never affect the exporter lifecycle. + } + }; + const emitForSignals = ( + signals: TelemetryExporterDiagnosticEvent["signal"][], + event: Omit, + ) => { + for (const signal of signals) { + emitExporterEvent({ signal, ...event }); + } + }; + const tracesEnabled = otel.traces !== false; + const metricsEnabled = otel.metrics !== false; + const logsEnabled = otel.logs === true; + const enabledSignals: TelemetryExporterDiagnosticEvent["signal"][] = [ + ...(tracesEnabled ? (["traces"] as const) : []), + ...(metricsEnabled ? (["metrics"] as const) : []), + ...(logsEnabled ? (["logs"] as const) : []), + ]; + if (enabledSignals.length === 0) { + return; + } + const protocol = otel.protocol ?? process.env.OTEL_EXPORTER_OTLP_PROTOCOL ?? 
"http/protobuf"; if (protocol !== "http/protobuf") { + emitForSignals(enabledSignals, { + exporter: "diagnostics-otel", + status: "failure", + reason: "unsupported_protocol", + }); ctx.logger.warn(`diagnostics-otel: unsupported protocol ${protocol}`); return; } - const endpoint = normalizeEndpoint(otel.endpoint ?? process.env.OTEL_EXPORTER_OTLP_ENDPOINT); + const endpoint = normalizeEndpoint( + otel.endpoint ?? process.env[OTEL_EXPORTER_OTLP_ENDPOINT_ENV], + ); const headers = otel.headers ?? undefined; const serviceName = otel.serviceName?.trim() || process.env.OTEL_SERVICE_NAME || DEFAULT_SERVICE_NAME; const sampleRate = resolveSampleRate(otel.sampleRate); const contentCapturePolicy = resolveContentCapturePolicy(otel.captureContent); - - const tracesEnabled = otel.traces !== false; - const metricsEnabled = otel.metrics !== false; - const logsEnabled = otel.logs === true; - if (!tracesEnabled && !metricsEnabled && !logsEnabled) { - return; - } const sdkPreloaded = hasPreloadedOtelSdk(); const resource = resourceFromAttributes({ [ATTR_SERVICE_NAME]: serviceName, }); - const logUrl = resolveOtelUrl(endpoint, "v1/logs"); + const logUrl = resolveSignalOtelUrl({ + signalEndpoint: otel.logsEndpoint, + signalEnvEndpoint: process.env[OTEL_EXPORTER_OTLP_LOGS_ENDPOINT_ENV], + endpoint, + path: "v1/logs", + }); if (!sdkPreloaded && (tracesEnabled || metricsEnabled)) { - const traceUrl = resolveOtelUrl(endpoint, "v1/traces"); - const metricUrl = resolveOtelUrl(endpoint, "v1/metrics"); + const traceUrl = resolveSignalOtelUrl({ + signalEndpoint: otel.tracesEndpoint, + signalEnvEndpoint: process.env[OTEL_EXPORTER_OTLP_TRACES_ENDPOINT_ENV], + endpoint, + path: "v1/traces", + }); + const metricUrl = resolveSignalOtelUrl({ + signalEndpoint: otel.metricsEndpoint, + signalEnvEndpoint: process.env[OTEL_EXPORTER_OTLP_METRICS_ENDPOINT_ENV], + endpoint, + path: "v1/metrics", + }); const traceExporter = tracesEnabled ? new OTLPTraceExporter({ ...(traceUrl ? 
{ url: traceUrl } : {}), @@ -581,6 +659,18 @@ export function createDiagnosticsOtelService(): OpenClawPluginService { try { sdk.start(); } catch (err) { + emitForSignals( + [ + ...(tracesEnabled ? (["traces"] as const) : []), + ...(metricsEnabled ? (["metrics"] as const) : []), + ], + { + exporter: "diagnostics-otel", + status: "failure", + reason: "start_failed", + errorCategory: errorCategory(err), + }, + ); await stopStarted(); ctx.logger.error(`diagnostics-otel: failed to start SDK: ${formatError(err)}`); throw err; @@ -750,6 +840,10 @@ export function createDiagnosticsOtelService(): OpenClawPluginService { unit: "1", description: "Diagnostic memory pressure events", }); + const telemetryExporterCounter = meter.createCounter("openclaw.telemetry.exporter.events", { + unit: "1", + description: "Diagnostic telemetry exporter lifecycle and failure events", + }); let recordLogRecord: | (( @@ -814,6 +908,13 @@ export function createDiagnosticsOtelService(): OpenClawPluginService { } otelLogger.emit(logRecord); } catch (err) { + emitExporterEvent({ + exporter: "diagnostics-otel", + signal: "logs", + status: "failure", + reason: "emit_failed", + errorCategory: errorCategory(err), + }); const now = Date.now(); if ( now - logRecordExportFailureLastReportedAt >= @@ -1569,6 +1670,24 @@ export function createDiagnosticsOtelService(): OpenClawPluginService { queueDepthHistogram.record(evt.queued, { "openclaw.channel": "heartbeat" }); }; + const recordTelemetryExporter = ( + evt: TelemetryExporterDiagnosticEvent, + metadata: DiagnosticEventMetadata, + ) => { + if (!metadata.trusted) { + return; + } + telemetryExporterCounter.add(1, { + "openclaw.exporter": lowCardinalityAttr(evt.exporter, "unknown"), + "openclaw.signal": evt.signal, + "openclaw.status": evt.status, + ...(evt.reason ? { "openclaw.reason": evt.reason } : {}), + ...(evt.errorCategory + ? 
{ "openclaw.errorCategory": lowCardinalityAttr(evt.errorCategory, "other") } + : {}), + }); + }; + const subscribe = ctx.internalDiagnostics?.onEvent; if (!subscribe) { ctx.logger.error("diagnostics-otel: internal diagnostics capability unavailable"); @@ -1656,6 +1775,9 @@ export function createDiagnosticsOtelService(): OpenClawPluginService { case "diagnostic.memory.pressure": recordMemoryPressure(evt); return; + case "telemetry.exporter": + recordTelemetryExporter(evt, metadata); + return; case "tool.execution.started": case "run.started": case "model.call.started": @@ -1669,6 +1791,12 @@ export function createDiagnosticsOtelService(): OpenClawPluginService { } }); + emitForSignals(enabledSignals, { + exporter: "diagnostics-otel", + status: "started", + reason: "configured", + }); + if (logsEnabled) { ctx.logger.info("diagnostics-otel: logs exporter enabled (OTLP/Protobuf)"); } diff --git a/src/config/schema.base.generated.ts b/src/config/schema.base.generated.ts index f36a22a4f02..9fad1353469 100644 --- a/src/config/schema.base.generated.ts +++ b/src/config/schema.base.generated.ts @@ -166,6 +166,24 @@ export const GENERATED_BASE_CONFIG_SCHEMA: BaseConfigSchemaResponse = { description: "Collector endpoint URL used for OpenTelemetry export transport, including scheme and port. Use a reachable, trusted collector endpoint and monitor ingestion errors after rollout.", }, + tracesEndpoint: { + type: "string", + title: "OpenTelemetry Traces Endpoint", + description: + "Signal-specific OTLP/HTTP trace endpoint. When set, this overrides diagnostics.otel.endpoint and OTEL_EXPORTER_OTLP_ENDPOINT for trace export only.", + }, + metricsEndpoint: { + type: "string", + title: "OpenTelemetry Metrics Endpoint", + description: + "Signal-specific OTLP/HTTP metrics endpoint. 
When set, this overrides diagnostics.otel.endpoint and OTEL_EXPORTER_OTLP_ENDPOINT for metrics export only.", + }, + logsEndpoint: { + type: "string", + title: "OpenTelemetry Logs Endpoint", + description: + "Signal-specific OTLP/HTTP logs endpoint. When set, this overrides diagnostics.otel.endpoint and OTEL_EXPORTER_OTLP_ENDPOINT for log export only.", + }, protocol: { anyOf: [ { @@ -23453,6 +23471,21 @@ export const GENERATED_BASE_CONFIG_SCHEMA: BaseConfigSchemaResponse = { help: "Collector endpoint URL used for OpenTelemetry export transport, including scheme and port. Use a reachable, trusted collector endpoint and monitor ingestion errors after rollout.", tags: ["observability"], }, + "diagnostics.otel.tracesEndpoint": { + label: "OpenTelemetry Traces Endpoint", + help: "Signal-specific OTLP/HTTP trace endpoint. When set, this overrides diagnostics.otel.endpoint and OTEL_EXPORTER_OTLP_ENDPOINT for trace export only.", + tags: ["observability"], + }, + "diagnostics.otel.metricsEndpoint": { + label: "OpenTelemetry Metrics Endpoint", + help: "Signal-specific OTLP/HTTP metrics endpoint. When set, this overrides diagnostics.otel.endpoint and OTEL_EXPORTER_OTLP_ENDPOINT for metrics export only.", + tags: ["observability"], + }, + "diagnostics.otel.logsEndpoint": { + label: "OpenTelemetry Logs Endpoint", + help: "Signal-specific OTLP/HTTP logs endpoint. When set, this overrides diagnostics.otel.endpoint and OTEL_EXPORTER_OTLP_ENDPOINT for log export only.", + tags: ["observability"], + }, "diagnostics.otel.protocol": { label: "OpenTelemetry Protocol", help: 'OTel transport protocol for telemetry export: "http/protobuf" or "grpc" depending on collector support. 
Use the protocol your observability backend expects to avoid dropped telemetry payloads.', diff --git a/src/config/schema.help.quality.test.ts b/src/config/schema.help.quality.test.ts index b70235f69c1..80331de0200 100644 --- a/src/config/schema.help.quality.test.ts +++ b/src/config/schema.help.quality.test.ts @@ -534,10 +534,13 @@ const FINAL_BACKLOG_TARGET_KEYS = [ "diagnostics.otel.endpoint", "diagnostics.otel.flushIntervalMs", "diagnostics.otel.headers", + "diagnostics.otel.logsEndpoint", "diagnostics.otel.logs", + "diagnostics.otel.metricsEndpoint", "diagnostics.otel.metrics", "diagnostics.otel.sampleRate", "diagnostics.otel.serviceName", + "diagnostics.otel.tracesEndpoint", "diagnostics.otel.traces", "gateway.remote.password", "gateway.remote.token", diff --git a/src/config/schema.help.ts b/src/config/schema.help.ts index 9220912cecd..6bae7757fc8 100644 --- a/src/config/schema.help.ts +++ b/src/config/schema.help.ts @@ -566,6 +566,12 @@ export const FIELD_HELP: Record = { "Enables OpenTelemetry export pipeline for traces, metrics, and logs based on configured endpoint/protocol settings. Keep disabled unless your collector endpoint and auth are fully configured.", "diagnostics.otel.endpoint": "Collector endpoint URL used for OpenTelemetry export transport, including scheme and port. Use a reachable, trusted collector endpoint and monitor ingestion errors after rollout.", + "diagnostics.otel.tracesEndpoint": + "Signal-specific OTLP/HTTP trace endpoint. When set, this overrides diagnostics.otel.endpoint and OTEL_EXPORTER_OTLP_ENDPOINT for trace export only.", + "diagnostics.otel.metricsEndpoint": + "Signal-specific OTLP/HTTP metrics endpoint. When set, this overrides diagnostics.otel.endpoint and OTEL_EXPORTER_OTLP_ENDPOINT for metrics export only.", + "diagnostics.otel.logsEndpoint": + "Signal-specific OTLP/HTTP logs endpoint. 
When set, this overrides diagnostics.otel.endpoint and OTEL_EXPORTER_OTLP_ENDPOINT for log export only.", "diagnostics.otel.protocol": 'OTel transport protocol for telemetry export: "http/protobuf" or "grpc" depending on collector support. Use the protocol your observability backend expects to avoid dropped telemetry payloads.', "diagnostics.otel.headers": diff --git a/src/config/schema.labels.ts b/src/config/schema.labels.ts index 776e02502ef..fe5eb67d76c 100644 --- a/src/config/schema.labels.ts +++ b/src/config/schema.labels.ts @@ -40,6 +40,9 @@ export const FIELD_LABELS: Record = { "diagnostics.stuckSessionWarnMs": "Stuck Session Warning Threshold (ms)", "diagnostics.otel.enabled": "OpenTelemetry Enabled", "diagnostics.otel.endpoint": "OpenTelemetry Endpoint", + "diagnostics.otel.tracesEndpoint": "OpenTelemetry Traces Endpoint", + "diagnostics.otel.metricsEndpoint": "OpenTelemetry Metrics Endpoint", + "diagnostics.otel.logsEndpoint": "OpenTelemetry Logs Endpoint", "diagnostics.otel.protocol": "OpenTelemetry Protocol", "diagnostics.otel.headers": "OpenTelemetry Headers", "diagnostics.otel.serviceName": "OpenTelemetry Service Name", diff --git a/src/config/types.base.ts b/src/config/types.base.ts index 86b2fd3b10a..eb58997a9da 100644 --- a/src/config/types.base.ts +++ b/src/config/types.base.ts @@ -234,6 +234,9 @@ export type LoggingConfig = { export type DiagnosticsOtelConfig = { enabled?: boolean; endpoint?: string; + tracesEndpoint?: string; + metricsEndpoint?: string; + logsEndpoint?: string; protocol?: "http/protobuf" | "grpc"; headers?: Record; serviceName?: string; diff --git a/src/config/zod-schema.ts b/src/config/zod-schema.ts index 345a20971cd..8eb41a1606a 100644 --- a/src/config/zod-schema.ts +++ b/src/config/zod-schema.ts @@ -306,6 +306,9 @@ export const OpenClawSchema = z .object({ enabled: z.boolean().optional(), endpoint: z.string().optional(), + tracesEndpoint: z.string().optional(), + metricsEndpoint: z.string().optional(), + logsEndpoint: 
z.string().optional(), protocol: z.union([z.literal("http/protobuf"), z.literal("grpc")]).optional(), headers: z.record(z.string(), z.string()).optional(), serviceName: z.string().optional(), diff --git a/src/infra/diagnostic-events.ts b/src/infra/diagnostic-events.ts index 9cc76ddb525..ba7f35f061e 100644 --- a/src/infra/diagnostic-events.ts +++ b/src/infra/diagnostic-events.ts @@ -352,6 +352,22 @@ export type DiagnosticLogRecordEvent = DiagnosticBaseEvent & { }; }; +export type DiagnosticTelemetryExporterEvent = DiagnosticBaseEvent & { + type: "telemetry.exporter"; + exporter: string; + signal: "traces" | "metrics" | "logs"; + status: "started" | "failure" | "dropped"; + reason?: + | "configured" + | "emit_failed" + | "handler_failed" + | "queue_full" + | "shutdown_failed" + | "start_failed" + | "unsupported_protocol"; + errorCategory?: string; +}; + export type DiagnosticEventPayload = | DiagnosticUsageEvent | DiagnosticWebhookReceivedEvent @@ -382,7 +398,8 @@ export type DiagnosticEventPayload = | DiagnosticMemorySampleEvent | DiagnosticMemoryPressureEvent | DiagnosticPayloadLargeEvent - | DiagnosticLogRecordEvent; + | DiagnosticLogRecordEvent + | DiagnosticTelemetryExporterEvent; export type DiagnosticEventInput = DiagnosticEventPayload extends infer Event ? Event extends DiagnosticEventPayload diff --git a/src/logging/diagnostic-stability.ts b/src/logging/diagnostic-stability.ts index 0707b6d41d1..50a2bbad2ca 100644 --- a/src/logging/diagnostic-stability.ts +++ b/src/logging/diagnostic-stability.ts @@ -345,6 +345,12 @@ function sanitizeDiagnosticEvent(event: DiagnosticEventPayload): DiagnosticStabi record.pluginId = event.pluginId; assignReasonCode(record, event.reason); break; + case "telemetry.exporter": + record.source = event.exporter; + record.target = event.signal; + record.outcome = event.status; + assignReasonCode(record, event.reason ?? 
event.errorCategory); + break; } return record; diff --git a/src/plugins/services.test.ts b/src/plugins/services.test.ts index 05975a77fcb..361a095a67d 100644 --- a/src/plugins/services.test.ts +++ b/src/plugins/services.test.ts @@ -189,6 +189,7 @@ describe("startPluginServices", () => { }); expect(contexts[0]?.internalDiagnostics?.onEvent).toBeTypeOf("function"); + expect(contexts[0]?.internalDiagnostics?.emit).toBeTypeOf("function"); const untrustedContexts: OpenClawPluginServiceContext[] = []; const untrustedService = createTrackingService("diagnostics-otel", { diff --git a/src/plugins/services.ts b/src/plugins/services.ts index 2cd873b2c0e..db5dd513572 100644 --- a/src/plugins/services.ts +++ b/src/plugins/services.ts @@ -1,6 +1,9 @@ import { STATE_DIR } from "../config/paths.js"; import type { OpenClawConfig } from "../config/types.openclaw.js"; -import { onInternalDiagnosticEvent } from "../infra/diagnostic-events.js"; +import { + emitTrustedDiagnosticEvent, + onInternalDiagnosticEvent, +} from "../infra/diagnostic-events.js"; import { createSubsystemLogger } from "../logging/subsystem.js"; import type { PluginServiceRegistration } from "./registry-types.js"; import type { PluginRegistry } from "./registry.js"; @@ -29,7 +32,12 @@ function createServiceContext(params: { ...(params.service?.origin === "bundled" && params.service.pluginId === "diagnostics-otel" && params.service.service.id === "diagnostics-otel" - ? { internalDiagnostics: { onEvent: onInternalDiagnosticEvent } } + ? 
{ + internalDiagnostics: { + emit: emitTrustedDiagnosticEvent, + onEvent: onInternalDiagnosticEvent, + }, + } : {}), }; } diff --git a/src/plugins/types.ts b/src/plugins/types.ts index c8a24a0f2a5..57ca7103671 100644 --- a/src/plugins/types.ts +++ b/src/plugins/types.ts @@ -28,6 +28,7 @@ import type { GatewayRequestHandler } from "../gateway/server-methods/types.js"; import type { InternalHookHandler } from "../hooks/internal-hook-types.js"; import type { ImageGenerationProvider } from "../image-generation/types.js"; import type { + DiagnosticEventInput, DiagnosticEventMetadata, DiagnosticEventPayload, } from "../infra/diagnostic-events.js"; @@ -1976,6 +1977,7 @@ export type OpenClawPluginServiceContext = { stateDir: string; logger: PluginLogger; internalDiagnostics?: { + emit: (event: DiagnosticEventInput) => void; onEvent: ( listener: (event: DiagnosticEventPayload, metadata: DiagnosticEventMetadata) => void, ) => () => void;