feat(diagnostics-otel): add exporter health diagnostics

Adds diagnostics-otel exporter health events and signal-specific endpoint wiring, with docs and config schema coverage.
This commit is contained in:
Vincent Koc
2026-04-25 18:34:44 -07:00
committed by GitHub
parent 25ecb2895a
commit 2495585a32
17 changed files with 446 additions and 26 deletions

View File

@@ -26,6 +26,8 @@ Docs: https://docs.openclaw.ai
- Plugins: migrate the local plugin registry automatically during package install/update, keeping install metadata in the plugin index while indexing existing plugin manifests for the new cold registry path. Thanks @vincentkoc and @shakkernerd.
- Plugins/doctor: make `openclaw doctor --fix` refresh the plugin index and cold registry index when needed without treating plugin install records as authored config. Thanks @vincentkoc and @shakkernerd.
- Diagnostics/OTEL: align model-call GenAI span attributes with OpenTelemetry stability opt-in semantics, keeping legacy `gen_ai.system` by default while emitting `gen_ai.provider.name` under `OTEL_SEMCONV_STABILITY_OPT_IN=gen_ai_latest_experimental`. Thanks @vincentkoc.
- Diagnostics/OTEL: support signal-specific OTLP endpoint overrides for traces, metrics, and logs via config or standard OTEL environment variables. Thanks @vincentkoc.
- Diagnostics/OTEL: emit bounded telemetry exporter health diagnostics for startup and log-export failures without exporting raw error text. Thanks @vincentkoc.
- Plugins/CLI: add `openclaw plugins registry` for explicit persisted-registry inspection and `--refresh` repair without making normal startup rescan plugin locations. Thanks @vincentkoc.
- Plugins/CLI: make `openclaw plugins list` read the cold persisted registry snapshot by default, leaving module-aware diagnostics to `plugins doctor` and `plugins inspect`. Thanks @vincentkoc.
- Plugins/startup: move gateway startup plugin planning onto the versioned cold registry index, with postinstall repair for older registry files that predate startup metadata. Thanks @vincentkoc.

View File

@@ -1,4 +1,4 @@
15a3740b57d0c95f0c0963c1d0eff6d85ecdb8cb03960b4763e847f8a24551c0 config-baseline.json
3c39a3a2008ce938886b600e9429a71921c1f9b00c64a16801f47d6d8d2ad7a8 config-baseline.core.json
211e9d4cdb309e7fe0c1ed91d060201240a9287f8c5cb3c893aba3f904a20d30 config-baseline.json
ffda2d2911adc03148a368f3b40b17cbdcb7af0066bccdc555e8d596cdea8cda config-baseline.core.json
7cd9c908f066c143eab2a201efbc9640f483ab28bba92ddeca1d18cc2b528bc3 config-baseline.channel.json
9e131d7734f8b9cc9e7f8af6cc6b6dc81c9971dc551fadbe66fb0d682173f32d config-baseline.plugin.json

View File

@@ -869,6 +869,9 @@ Notes:
otel: {
enabled: false,
endpoint: "https://otel-collector.example.com:4318",
tracesEndpoint: "https://traces.example.com/v1/traces",
metricsEndpoint: "https://metrics.example.com/v1/metrics",
logsEndpoint: "https://logs.example.com/v1/logs",
protocol: "http/protobuf", // http/protobuf | grpc
headers: { "x-tenant-id": "my-org" },
serviceName: "openclaw-gateway",
@@ -903,6 +906,7 @@ Notes:
- `stuckSessionWarnMs`: age threshold in ms for emitting stuck-session warnings while a session remains in processing state.
- `otel.enabled`: enables the OpenTelemetry export pipeline (default: `false`). For the full configuration, signal catalog, and privacy model, see [OpenTelemetry export](/gateway/opentelemetry).
- `otel.endpoint`: collector URL for OTel export.
- `otel.tracesEndpoint` / `otel.metricsEndpoint` / `otel.logsEndpoint`: optional signal-specific OTLP endpoints. When set, they override `otel.endpoint` for that signal only.
- `otel.protocol`: `"http/protobuf"` (default) or `"grpc"`.
- `otel.headers`: extra HTTP/gRPC metadata headers sent with OTel export requests.
- `otel.serviceName`: service name for resource attributes.
@@ -912,6 +916,7 @@ Notes:
- `otel.captureContent`: opt-in raw content capture for OTEL span attributes. Defaults to off. Boolean `true` captures non-system message/tool content; the object form lets you enable `inputMessages`, `outputMessages`, `toolInputs`, `toolOutputs`, and `systemPrompt` explicitly.
- `OTEL_SEMCONV_STABILITY_OPT_IN=gen_ai_latest_experimental`: environment toggle for latest experimental GenAI span provider attributes. By default spans keep the legacy `gen_ai.system` attribute for compatibility; GenAI metrics use bounded semantic attributes.
- `OPENCLAW_OTEL_PRELOADED=1`: environment toggle for hosts that already registered a global OpenTelemetry SDK. OpenClaw then skips plugin-owned SDK startup/shutdown while keeping diagnostic listeners active.
- `OTEL_EXPORTER_OTLP_TRACES_ENDPOINT`, `OTEL_EXPORTER_OTLP_METRICS_ENDPOINT`, and `OTEL_EXPORTER_OTLP_LOGS_ENDPOINT`: signal-specific endpoint env vars, used when the matching `otel.*Endpoint` config key is unset. For that signal they take precedence over the shared `otel.endpoint`.
- `cacheTrace.enabled`: log cache trace snapshots for embedded runs (default: `false`).
- `cacheTrace.filePath`: output path for cache trace JSONL (default: `$OPENCLAW_STATE_DIR/logs/cache-trace.jsonl`).
- `cacheTrace.includeMessages` / `includePrompt` / `includeSystem`: control what is included in cache trace output (all default: `true`).

View File

@@ -79,6 +79,9 @@ when `diagnostics.otel.enabled` is true.
otel: {
enabled: true,
endpoint: "http://otel-collector:4318",
tracesEndpoint: "http://otel-collector:4318/v1/traces",
metricsEndpoint: "http://otel-collector:4318/v1/metrics",
logsEndpoint: "http://otel-collector:4318/v1/logs",
protocol: "http/protobuf", // grpc is not supported yet: the exporter logs a warning and does not start
serviceName: "openclaw-gateway",
headers: { "x-collector-token": "..." },
@@ -102,13 +105,14 @@ when `diagnostics.otel.enabled` is true.
### Environment variables
| Variable | Purpose |
| ------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| `OTEL_EXPORTER_OTLP_ENDPOINT` | Override `diagnostics.otel.endpoint`. If the value already contains `/v1/traces`, `/v1/metrics`, or `/v1/logs`, it is used as-is. |
| `OTEL_SERVICE_NAME` | Override `diagnostics.otel.serviceName`. |
| `OTEL_EXPORTER_OTLP_PROTOCOL` | Override the wire protocol (only `http/protobuf` is honored today). |
| `OTEL_SEMCONV_STABILITY_OPT_IN` | Set to `gen_ai_latest_experimental` to emit the latest experimental GenAI span attribute (`gen_ai.provider.name`) instead of the legacy `gen_ai.system`. GenAI metrics always use bounded, low-cardinality semantic attributes regardless. |
| `OPENCLAW_OTEL_PRELOADED` | Set to `1` when another preload or host process already registered the global OpenTelemetry SDK. The plugin then skips its own NodeSDK lifecycle but still wires diagnostic listeners and honors `traces`/`metrics`/`logs`. |
| Variable | Purpose |
| ----------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| `OTEL_EXPORTER_OTLP_ENDPOINT` | Fallback for `diagnostics.otel.endpoint` when that config key is unset; the config value wins when both are set. If the value already contains `/v1/traces`, `/v1/metrics`, or `/v1/logs`, it is used as-is. |
| `OTEL_EXPORTER_OTLP_TRACES_ENDPOINT` / `OTEL_EXPORTER_OTLP_METRICS_ENDPOINT` / `OTEL_EXPORTER_OTLP_LOGS_ENDPOINT` | Signal-specific endpoint overrides used when the matching `diagnostics.otel.*Endpoint` config key is unset. Signal-specific config wins over signal-specific env, which wins over the shared endpoint. |
| `OTEL_SERVICE_NAME` | Fallback for `diagnostics.otel.serviceName` when that config key is unset or blank. |
| `OTEL_EXPORTER_OTLP_PROTOCOL` | Fallback for the wire protocol when `diagnostics.otel.protocol` is unset (only `http/protobuf` is honored today). |
| `OTEL_SEMCONV_STABILITY_OPT_IN` | Set to `gen_ai_latest_experimental` to emit the latest experimental GenAI span attribute (`gen_ai.provider.name`) instead of the legacy `gen_ai.system`. GenAI metrics always use bounded, low-cardinality semantic attributes regardless. |
| `OPENCLAW_OTEL_PRELOADED` | Set to `1` when another preload or host process already registered the global OpenTelemetry SDK. The plugin then skips its own NodeSDK lifecycle but still wires diagnostic listeners and honors `traces`/`metrics`/`logs`. |
## Privacy and content capture

View File

@@ -41,6 +41,8 @@ const sdkShutdown = vi.hoisted(() => vi.fn().mockResolvedValue(undefined));
const logEmit = vi.hoisted(() => vi.fn());
const logShutdown = vi.hoisted(() => vi.fn().mockResolvedValue(undefined));
const traceExporterCtor = vi.hoisted(() => vi.fn());
const metricExporterCtor = vi.hoisted(() => vi.fn());
const logExporterCtor = vi.hoisted(() => vi.fn());
vi.mock("@opentelemetry/api", () => ({
context: {
@@ -70,7 +72,9 @@ vi.mock("@opentelemetry/sdk-node", () => ({
}));
vi.mock("@opentelemetry/exporter-metrics-otlp-proto", () => ({
OTLPMetricExporter: function OTLPMetricExporter() {},
OTLPMetricExporter: function OTLPMetricExporter(options?: unknown) {
metricExporterCtor(options);
},
}));
vi.mock("@opentelemetry/exporter-trace-otlp-proto", () => ({
@@ -80,7 +84,9 @@ vi.mock("@opentelemetry/exporter-trace-otlp-proto", () => ({
}));
vi.mock("@opentelemetry/exporter-logs-otlp-proto", () => ({
OTLPLogExporter: function OTLPLogExporter() {},
OTLPLogExporter: function OTLPLogExporter(options?: unknown) {
logExporterCtor(options);
},
}));
vi.mock("@opentelemetry/sdk-logs", () => ({
@@ -133,6 +139,10 @@ const PROTO_KEY = "__proto__";
const MAX_TEST_OTEL_CONTENT_ATTRIBUTE_CHARS = 4096;
const OTEL_TRUNCATED_SUFFIX_MAX_CHARS = 20;
const ORIGINAL_OPENCLAW_OTEL_PRELOADED = process.env.OPENCLAW_OTEL_PRELOADED;
const ORIGINAL_OTEL_EXPORTER_OTLP_TRACES_ENDPOINT = process.env.OTEL_EXPORTER_OTLP_TRACES_ENDPOINT;
const ORIGINAL_OTEL_EXPORTER_OTLP_METRICS_ENDPOINT =
process.env.OTEL_EXPORTER_OTLP_METRICS_ENDPOINT;
const ORIGINAL_OTEL_EXPORTER_OTLP_LOGS_ENDPOINT = process.env.OTEL_EXPORTER_OTLP_LOGS_ENDPOINT;
const ORIGINAL_OTEL_SEMCONV_STABILITY_OPT_IN = process.env.OTEL_SEMCONV_STABILITY_OPT_IN;
function createLogger() {
@@ -173,7 +183,10 @@ function createOtelContext(
},
logger: createLogger(),
stateDir: OTEL_TEST_STATE_DIR,
internalDiagnostics: { onEvent: onInternalDiagnosticEvent },
internalDiagnostics: {
emit: emitTrustedDiagnosticEvent,
onEvent: onInternalDiagnosticEvent,
},
};
}
@@ -220,6 +233,11 @@ describe("diagnostics-otel service", () => {
logEmit.mockReset();
logShutdown.mockClear();
traceExporterCtor.mockClear();
metricExporterCtor.mockClear();
logExporterCtor.mockClear();
delete process.env.OTEL_EXPORTER_OTLP_TRACES_ENDPOINT;
delete process.env.OTEL_EXPORTER_OTLP_METRICS_ENDPOINT;
delete process.env.OTEL_EXPORTER_OTLP_LOGS_ENDPOINT;
});
afterEach(() => {
@@ -233,6 +251,22 @@ describe("diagnostics-otel service", () => {
} else {
process.env.OTEL_SEMCONV_STABILITY_OPT_IN = ORIGINAL_OTEL_SEMCONV_STABILITY_OPT_IN;
}
if (ORIGINAL_OTEL_EXPORTER_OTLP_TRACES_ENDPOINT === undefined) {
delete process.env.OTEL_EXPORTER_OTLP_TRACES_ENDPOINT;
} else {
process.env.OTEL_EXPORTER_OTLP_TRACES_ENDPOINT = ORIGINAL_OTEL_EXPORTER_OTLP_TRACES_ENDPOINT;
}
if (ORIGINAL_OTEL_EXPORTER_OTLP_METRICS_ENDPOINT === undefined) {
delete process.env.OTEL_EXPORTER_OTLP_METRICS_ENDPOINT;
} else {
process.env.OTEL_EXPORTER_OTLP_METRICS_ENDPOINT =
ORIGINAL_OTEL_EXPORTER_OTLP_METRICS_ENDPOINT;
}
if (ORIGINAL_OTEL_EXPORTER_OTLP_LOGS_ENDPOINT === undefined) {
delete process.env.OTEL_EXPORTER_OTLP_LOGS_ENDPOINT;
} else {
process.env.OTEL_EXPORTER_OTLP_LOGS_ENDPOINT = ORIGINAL_OTEL_EXPORTER_OTLP_LOGS_ENDPOINT;
}
});
test("records message-flow metrics and spans", async () => {
@@ -395,6 +429,124 @@ describe("diagnostics-otel service", () => {
expect(logShutdown).toHaveBeenCalledTimes(1);
});
// Starting the service with all three signals enabled must publish one
// "started"/"configured" exporter health event per signal and count it on the
// exporter-events counter.
test("emits and records bounded telemetry exporter health events", async () => {
  const events: Array<Parameters<Parameters<typeof onInternalDiagnosticEvent>[0]>[0]> = [];
  const unsubscribe = onInternalDiagnosticEvent((event) => {
    if (event.type === "telemetry.exporter") {
      events.push(event);
    }
  });

  const service = createDiagnosticsOtelService();
  const ctx = createOtelContext(OTEL_TEST_ENDPOINT, { traces: true, metrics: true, logs: true });
  try {
    await service.start(ctx);

    expect(events).toEqual(
      expect.arrayContaining([
        expect.objectContaining({
          type: "telemetry.exporter",
          exporter: "diagnostics-otel",
          signal: "traces",
          status: "started",
          reason: "configured",
        }),
        expect.objectContaining({
          type: "telemetry.exporter",
          exporter: "diagnostics-otel",
          signal: "metrics",
          status: "started",
          reason: "configured",
        }),
        expect.objectContaining({
          type: "telemetry.exporter",
          exporter: "diagnostics-otel",
          signal: "logs",
          status: "started",
          reason: "configured",
        }),
      ]),
    );
    expect(
      telemetryState.counters.get("openclaw.telemetry.exporter.events")?.add,
    ).toHaveBeenCalledWith(1, {
      "openclaw.exporter": "diagnostics-otel",
      "openclaw.signal": "logs",
      "openclaw.status": "started",
      "openclaw.reason": "configured",
    });
  } finally {
    // Detach the listener even when an assertion throws, so a failure here
    // cannot leave a subscriber attached while later tests run.
    unsubscribe();
  }
  await service.stop?.(ctx);
});
// A throwing log emitter must surface as a "failure"/"emit_failed" health
// event that carries only the error's class name, never its message text.
test("reports log exporter emit failures without exporting raw error text", async () => {
  const events: Array<Parameters<Parameters<typeof onInternalDiagnosticEvent>[0]>[0]> = [];
  const unsubscribe = onInternalDiagnosticEvent((event) => {
    if (event.type === "telemetry.exporter") {
      events.push(event);
    }
  });

  const service = createDiagnosticsOtelService();
  const ctx = createOtelContext(OTEL_TEST_ENDPOINT, { logs: true });
  logEmit.mockImplementationOnce(() => {
    throw new TypeError("token sk-test-secret should not leave as telemetry");
  });
  try {
    await service.start(ctx);

    emitDiagnosticEvent({
      type: "log.record",
      level: "INFO",
      message: "export me",
    });
    await flushDiagnosticEvents();

    expect(events).toEqual(
      expect.arrayContaining([
        expect.objectContaining({
          type: "telemetry.exporter",
          exporter: "diagnostics-otel",
          signal: "logs",
          status: "failure",
          reason: "emit_failed",
          errorCategory: "TypeError",
        }),
      ]),
    );
    expect(
      telemetryState.counters.get("openclaw.telemetry.exporter.events")?.add,
    ).toHaveBeenCalledWith(1, {
      "openclaw.exporter": "diagnostics-otel",
      "openclaw.signal": "logs",
      "openclaw.status": "failure",
      "openclaw.reason": "emit_failed",
      "openclaw.errorCategory": "TypeError",
    });
  } finally {
    // Detach the listener even when an assertion throws, so a failure here
    // cannot leave a subscriber attached while later tests run.
    unsubscribe();
  }
  await service.stop?.(ctx);
});
// A `telemetry.exporter` event sent through the public emitter must not be
// counted on the exporter-events counter.
test("ignores untrusted telemetry exporter events for OTEL metrics", async () => {
  const service = createDiagnosticsOtelService();
  const ctx = createOtelContext(OTEL_TEST_ENDPOINT, { metrics: true });
  await service.start(ctx);
  // Drop the calls recorded for the service's own startup events.
  telemetryState.counters.get("openclaw.telemetry.exporter.events")?.add.mockClear();

  emitDiagnosticEvent({
    type: "telemetry.exporter",
    exporter: "spoofed-plugin-exporter",
    signal: "metrics",
    status: "failure",
    reason: "emit_failed",
  });
  // Wait for any deferred dispatch before asserting a negative; otherwise the
  // check could pass simply because the listener has not run yet.
  await flushDiagnosticEvents();

  expect(
    telemetryState.counters.get("openclaw.telemetry.exporter.events")?.add,
  ).not.toHaveBeenCalled();
  await service.stop?.(ctx);
});
test("honors disabled traces when an OpenTelemetry SDK is preloaded", async () => {
process.env.OPENCLAW_OTEL_PRELOADED = "1";
const service = createDiagnosticsOtelService();
@@ -489,6 +641,50 @@ describe("diagnostics-otel service", () => {
await service.stop?.(ctx);
});
// Per-signal config endpoints must reach each exporter constructor: a value
// without a signal path gets one appended, a value that already has it is kept.
test("uses signal-specific OTLP endpoints ahead of the shared endpoint", async () => {
  const firstCtorUrl = (ctor: typeof traceExporterCtor): string | undefined =>
    (ctor.mock.calls[0]?.[0] as { url?: string } | undefined)?.url;

  const service = createDiagnosticsOtelService();
  const ctx = createOtelContext(OTEL_TEST_ENDPOINT, {
    traces: true,
    metrics: true,
    logs: true,
  });
  const otelConfig = ctx.config.diagnostics!.otel!;
  otelConfig.tracesEndpoint = "https://trace.example.com/otlp";
  otelConfig.metricsEndpoint = "https://metric.example.com/v1/metrics";
  otelConfig.logsEndpoint = "https://log.example.com/otlp";

  await service.start(ctx);

  expect(firstCtorUrl(traceExporterCtor)).toBe("https://trace.example.com/otlp/v1/traces");
  expect(firstCtorUrl(metricExporterCtor)).toBe("https://metric.example.com/v1/metrics");
  expect(firstCtorUrl(logExporterCtor)).toBe("https://log.example.com/otlp/v1/logs");
  await service.stop?.(ctx);
});
// With no per-signal config keys set, the per-signal environment variables
// must decide the URL handed to each exporter constructor.
test("uses signal-specific OTLP env endpoints when config is unset", async () => {
  const firstCtorUrl = (ctor: typeof traceExporterCtor): string | undefined =>
    (ctor.mock.calls[0]?.[0] as { url?: string } | undefined)?.url;

  process.env.OTEL_EXPORTER_OTLP_TRACES_ENDPOINT = "https://trace-env.example.com/v1/traces";
  process.env.OTEL_EXPORTER_OTLP_METRICS_ENDPOINT = "https://metric-env.example.com/otlp";
  process.env.OTEL_EXPORTER_OTLP_LOGS_ENDPOINT = "https://log-env.example.com/otlp";

  const service = createDiagnosticsOtelService();
  const ctx = createOtelContext(OTEL_TEST_ENDPOINT, {
    traces: true,
    metrics: true,
    logs: true,
  });
  await service.start(ctx);

  expect(firstCtorUrl(traceExporterCtor)).toBe("https://trace-env.example.com/v1/traces");
  expect(firstCtorUrl(metricExporterCtor)).toBe("https://metric-env.example.com/otlp/v1/metrics");
  expect(firstCtorUrl(logExporterCtor)).toBe("https://log-env.example.com/otlp/v1/logs");
  await service.stop?.(ctx);
});
test("redacts sensitive data from log messages before export", async () => {
const emitCall = await emitAndCaptureLog({
level: "INFO",

View File

@@ -50,6 +50,10 @@ const OTEL_LOG_RAW_ATTRIBUTE_KEY_RE = /^[A-Za-z0-9_.:-]{1,64}$/u;
const OTEL_LOG_ATTRIBUTE_KEY_RE = /^[A-Za-z0-9_.:-]{1,96}$/u;
const BLOCKED_OTEL_LOG_ATTRIBUTE_KEYS = new Set(["__proto__", "prototype", "constructor"]);
const PRELOADED_OTEL_SDK_ENV = "OPENCLAW_OTEL_PRELOADED";
const OTEL_EXPORTER_OTLP_ENDPOINT_ENV = "OTEL_EXPORTER_OTLP_ENDPOINT";
const OTEL_EXPORTER_OTLP_TRACES_ENDPOINT_ENV = "OTEL_EXPORTER_OTLP_TRACES_ENDPOINT";
const OTEL_EXPORTER_OTLP_METRICS_ENDPOINT_ENV = "OTEL_EXPORTER_OTLP_METRICS_ENDPOINT";
const OTEL_EXPORTER_OTLP_LOGS_ENDPOINT_ENV = "OTEL_EXPORTER_OTLP_LOGS_ENDPOINT";
const OTEL_SEMCONV_STABILITY_OPT_IN_ENV = "OTEL_SEMCONV_STABILITY_OPT_IN";
const GEN_AI_LATEST_EXPERIMENTAL_OPT_IN = "gen_ai_latest_experimental";
const GEN_AI_TOKEN_USAGE_BUCKETS = [
@@ -77,6 +81,10 @@ type ModelCallLifecycleDiagnosticEvent = Extract<
DiagnosticEventPayload,
{ type: "model.call.completed" | "model.call.error" }
>;
type TelemetryExporterDiagnosticEvent = Extract<
DiagnosticEventPayload,
{ type: "telemetry.exporter" }
>;
const NO_CONTENT_CAPTURE: OtelContentCapturePolicy = {
inputMessages: false,
@@ -102,6 +110,18 @@ function resolveOtelUrl(endpoint: string | undefined, path: string): string | un
return `${endpoint}/${path}`;
}
/**
 * Resolves the OTLP URL for one signal (traces, metrics, or logs).
 *
 * Precedence: signal-specific config endpoint, then signal-specific env
 * endpoint, then the shared endpoint. Each signal-specific candidate is
 * normalized on its own, so a config value that `normalizeEndpoint` rejects
 * does not hide a usable env value.
 * NOTE(review): this assumes `normalizeEndpoint` returns `undefined` for blank
 * input — confirm against its definition.
 *
 * @param params.signalEndpoint - Signal-specific endpoint from config.
 * @param params.signalEnvEndpoint - Signal-specific endpoint from the environment.
 * @param params.endpoint - Shared endpoint, used as passed in by the caller.
 * @param params.path - Signal path (for example `v1/traces`) given to `resolveOtelUrl`.
 * @returns The result of `resolveOtelUrl` for the selected endpoint and path.
 */
function resolveSignalOtelUrl(params: {
  signalEndpoint?: string;
  signalEnvEndpoint?: string;
  endpoint?: string;
  path: string;
}): string | undefined {
  const signalSpecific =
    normalizeEndpoint(params.signalEndpoint) ?? normalizeEndpoint(params.signalEnvEndpoint);
  return resolveOtelUrl(signalSpecific ?? params.endpoint, params.path);
}
function resolveSampleRate(value: number | undefined): number | undefined {
if (typeof value !== "number" || !Number.isFinite(value)) {
return undefined;
@@ -126,6 +146,17 @@ function formatError(err: unknown): string {
}
}
/**
 * Produces a short category label for a thrown value.
 *
 * `Error` instances with a non-blank string `name` are labelled by that name
 * (passed through `lowCardinalityAttr` with fallback "Error"); anything else
 * is labelled by its `typeof` tag (fallback "unknown"). If deriving the label
 * throws, "unknown" is returned instead.
 */
function errorCategory(err: unknown): string {
  try {
    const name = err instanceof Error ? err.name : undefined;
    if (typeof name === "string" && name.trim()) {
      return lowCardinalityAttr(name, "Error");
    }
    return lowCardinalityAttr(typeof err, "unknown");
  } catch {
    return "unknown";
  }
}
function redactOtelAttributes(attributes: Record<string, string | number | boolean>) {
const redactedAttributes: Record<string, string | number | boolean> = {};
for (const [key, value] of Object.entries(attributes)) {
@@ -513,35 +544,82 @@ export function createDiagnosticsOtelService(): OpenClawPluginService {
return;
}
// Publishes one `telemetry.exporter` event through the context's
// `internalDiagnostics.emit` hook. The caller provides every field except
// `type`, `seq`, and `ts`; `type` is filled in here. When the context has no
// `internalDiagnostics`, the optional chain makes this a no-op.
const emitExporterEvent = (
event: Omit<TelemetryExporterDiagnosticEvent, "type" | "seq" | "ts">,
) => {
try {
ctx.internalDiagnostics?.emit({
type: "telemetry.exporter",
...event,
});
} catch {
// Exporter health must never affect the exporter lifecycle.
}
};
// Sends the same exporter health event once per listed signal, in list order.
const emitForSignals = (
  signals: TelemetryExporterDiagnosticEvent["signal"][],
  event: Omit<TelemetryExporterDiagnosticEvent, "type" | "seq" | "ts" | "signal">,
) => {
  signals.forEach((signal) => {
    emitExporterEvent({ signal, ...event });
  });
};
const tracesEnabled = otel.traces !== false;
const metricsEnabled = otel.metrics !== false;
const logsEnabled = otel.logs === true;
const enabledSignals: TelemetryExporterDiagnosticEvent["signal"][] = [
...(tracesEnabled ? (["traces"] as const) : []),
...(metricsEnabled ? (["metrics"] as const) : []),
...(logsEnabled ? (["logs"] as const) : []),
];
if (enabledSignals.length === 0) {
return;
}
const protocol = otel.protocol ?? process.env.OTEL_EXPORTER_OTLP_PROTOCOL ?? "http/protobuf";
if (protocol !== "http/protobuf") {
emitForSignals(enabledSignals, {
exporter: "diagnostics-otel",
status: "failure",
reason: "unsupported_protocol",
});
ctx.logger.warn(`diagnostics-otel: unsupported protocol ${protocol}`);
return;
}
const endpoint = normalizeEndpoint(otel.endpoint ?? process.env.OTEL_EXPORTER_OTLP_ENDPOINT);
const endpoint = normalizeEndpoint(
otel.endpoint ?? process.env[OTEL_EXPORTER_OTLP_ENDPOINT_ENV],
);
const headers = otel.headers ?? undefined;
const serviceName =
otel.serviceName?.trim() || process.env.OTEL_SERVICE_NAME || DEFAULT_SERVICE_NAME;
const sampleRate = resolveSampleRate(otel.sampleRate);
const contentCapturePolicy = resolveContentCapturePolicy(otel.captureContent);
const tracesEnabled = otel.traces !== false;
const metricsEnabled = otel.metrics !== false;
const logsEnabled = otel.logs === true;
if (!tracesEnabled && !metricsEnabled && !logsEnabled) {
return;
}
const sdkPreloaded = hasPreloadedOtelSdk();
const resource = resourceFromAttributes({
[ATTR_SERVICE_NAME]: serviceName,
});
const logUrl = resolveOtelUrl(endpoint, "v1/logs");
const logUrl = resolveSignalOtelUrl({
signalEndpoint: otel.logsEndpoint,
signalEnvEndpoint: process.env[OTEL_EXPORTER_OTLP_LOGS_ENDPOINT_ENV],
endpoint,
path: "v1/logs",
});
if (!sdkPreloaded && (tracesEnabled || metricsEnabled)) {
const traceUrl = resolveOtelUrl(endpoint, "v1/traces");
const metricUrl = resolveOtelUrl(endpoint, "v1/metrics");
const traceUrl = resolveSignalOtelUrl({
signalEndpoint: otel.tracesEndpoint,
signalEnvEndpoint: process.env[OTEL_EXPORTER_OTLP_TRACES_ENDPOINT_ENV],
endpoint,
path: "v1/traces",
});
const metricUrl = resolveSignalOtelUrl({
signalEndpoint: otel.metricsEndpoint,
signalEnvEndpoint: process.env[OTEL_EXPORTER_OTLP_METRICS_ENDPOINT_ENV],
endpoint,
path: "v1/metrics",
});
const traceExporter = tracesEnabled
? new OTLPTraceExporter({
...(traceUrl ? { url: traceUrl } : {}),
@@ -581,6 +659,18 @@ export function createDiagnosticsOtelService(): OpenClawPluginService {
try {
sdk.start();
} catch (err) {
emitForSignals(
[
...(tracesEnabled ? (["traces"] as const) : []),
...(metricsEnabled ? (["metrics"] as const) : []),
],
{
exporter: "diagnostics-otel",
status: "failure",
reason: "start_failed",
errorCategory: errorCategory(err),
},
);
await stopStarted();
ctx.logger.error(`diagnostics-otel: failed to start SDK: ${formatError(err)}`);
throw err;
@@ -750,6 +840,10 @@ export function createDiagnosticsOtelService(): OpenClawPluginService {
unit: "1",
description: "Diagnostic memory pressure events",
});
const telemetryExporterCounter = meter.createCounter("openclaw.telemetry.exporter.events", {
unit: "1",
description: "Diagnostic telemetry exporter lifecycle and failure events",
});
let recordLogRecord:
| ((
@@ -814,6 +908,13 @@ export function createDiagnosticsOtelService(): OpenClawPluginService {
}
otelLogger.emit(logRecord);
} catch (err) {
emitExporterEvent({
exporter: "diagnostics-otel",
signal: "logs",
status: "failure",
reason: "emit_failed",
errorCategory: errorCategory(err),
});
const now = Date.now();
if (
now - logRecordExportFailureLastReportedAt >=
@@ -1569,6 +1670,24 @@ export function createDiagnosticsOtelService(): OpenClawPluginService {
queueDepthHistogram.record(evt.queued, { "openclaw.channel": "heartbeat" });
};
// Adds one to the exporter-events counter for a `telemetry.exporter` event.
// Events whose metadata is not marked trusted are dropped without counting.
// `reason` and `errorCategory` become attributes only when they are set.
const recordTelemetryExporter = (
  evt: TelemetryExporterDiagnosticEvent,
  metadata: DiagnosticEventMetadata,
) => {
  if (!metadata.trusted) {
    return;
  }
  const attributes: Record<string, string> = {
    "openclaw.exporter": lowCardinalityAttr(evt.exporter, "unknown"),
    "openclaw.signal": evt.signal,
    "openclaw.status": evt.status,
  };
  if (evt.reason) {
    attributes["openclaw.reason"] = evt.reason;
  }
  if (evt.errorCategory) {
    attributes["openclaw.errorCategory"] = lowCardinalityAttr(evt.errorCategory, "other");
  }
  telemetryExporterCounter.add(1, attributes);
};
const subscribe = ctx.internalDiagnostics?.onEvent;
if (!subscribe) {
ctx.logger.error("diagnostics-otel: internal diagnostics capability unavailable");
@@ -1656,6 +1775,9 @@ export function createDiagnosticsOtelService(): OpenClawPluginService {
case "diagnostic.memory.pressure":
recordMemoryPressure(evt);
return;
case "telemetry.exporter":
recordTelemetryExporter(evt, metadata);
return;
case "tool.execution.started":
case "run.started":
case "model.call.started":
@@ -1669,6 +1791,12 @@ export function createDiagnosticsOtelService(): OpenClawPluginService {
}
});
emitForSignals(enabledSignals, {
exporter: "diagnostics-otel",
status: "started",
reason: "configured",
});
if (logsEnabled) {
ctx.logger.info("diagnostics-otel: logs exporter enabled (OTLP/Protobuf)");
}

View File

@@ -166,6 +166,24 @@ export const GENERATED_BASE_CONFIG_SCHEMA: BaseConfigSchemaResponse = {
description:
"Collector endpoint URL used for OpenTelemetry export transport, including scheme and port. Use a reachable, trusted collector endpoint and monitor ingestion errors after rollout.",
},
tracesEndpoint: {
type: "string",
title: "OpenTelemetry Traces Endpoint",
description:
"Signal-specific OTLP/HTTP trace endpoint. When set, this overrides diagnostics.otel.endpoint and OTEL_EXPORTER_OTLP_ENDPOINT for trace export only.",
},
metricsEndpoint: {
type: "string",
title: "OpenTelemetry Metrics Endpoint",
description:
"Signal-specific OTLP/HTTP metrics endpoint. When set, this overrides diagnostics.otel.endpoint and OTEL_EXPORTER_OTLP_ENDPOINT for metrics export only.",
},
logsEndpoint: {
type: "string",
title: "OpenTelemetry Logs Endpoint",
description:
"Signal-specific OTLP/HTTP logs endpoint. When set, this overrides diagnostics.otel.endpoint and OTEL_EXPORTER_OTLP_ENDPOINT for log export only.",
},
protocol: {
anyOf: [
{
@@ -23453,6 +23471,21 @@ export const GENERATED_BASE_CONFIG_SCHEMA: BaseConfigSchemaResponse = {
help: "Collector endpoint URL used for OpenTelemetry export transport, including scheme and port. Use a reachable, trusted collector endpoint and monitor ingestion errors after rollout.",
tags: ["observability"],
},
"diagnostics.otel.tracesEndpoint": {
label: "OpenTelemetry Traces Endpoint",
help: "Signal-specific OTLP/HTTP trace endpoint. When set, this overrides diagnostics.otel.endpoint and OTEL_EXPORTER_OTLP_ENDPOINT for trace export only.",
tags: ["observability"],
},
"diagnostics.otel.metricsEndpoint": {
label: "OpenTelemetry Metrics Endpoint",
help: "Signal-specific OTLP/HTTP metrics endpoint. When set, this overrides diagnostics.otel.endpoint and OTEL_EXPORTER_OTLP_ENDPOINT for metrics export only.",
tags: ["observability"],
},
"diagnostics.otel.logsEndpoint": {
label: "OpenTelemetry Logs Endpoint",
help: "Signal-specific OTLP/HTTP logs endpoint. When set, this overrides diagnostics.otel.endpoint and OTEL_EXPORTER_OTLP_ENDPOINT for log export only.",
tags: ["observability"],
},
"diagnostics.otel.protocol": {
label: "OpenTelemetry Protocol",
help: 'OTel transport protocol for telemetry export: "http/protobuf" or "grpc" depending on collector support. Use the protocol your observability backend expects to avoid dropped telemetry payloads.',

View File

@@ -534,10 +534,13 @@ const FINAL_BACKLOG_TARGET_KEYS = [
"diagnostics.otel.endpoint",
"diagnostics.otel.flushIntervalMs",
"diagnostics.otel.headers",
"diagnostics.otel.logsEndpoint",
"diagnostics.otel.logs",
"diagnostics.otel.metricsEndpoint",
"diagnostics.otel.metrics",
"diagnostics.otel.sampleRate",
"diagnostics.otel.serviceName",
"diagnostics.otel.tracesEndpoint",
"diagnostics.otel.traces",
"gateway.remote.password",
"gateway.remote.token",

View File

@@ -566,6 +566,12 @@ export const FIELD_HELP: Record<string, string> = {
"Enables OpenTelemetry export pipeline for traces, metrics, and logs based on configured endpoint/protocol settings. Keep disabled unless your collector endpoint and auth are fully configured.",
"diagnostics.otel.endpoint":
"Collector endpoint URL used for OpenTelemetry export transport, including scheme and port. Use a reachable, trusted collector endpoint and monitor ingestion errors after rollout.",
"diagnostics.otel.tracesEndpoint":
"Signal-specific OTLP/HTTP trace endpoint. When set, this overrides diagnostics.otel.endpoint and OTEL_EXPORTER_OTLP_ENDPOINT for trace export only.",
"diagnostics.otel.metricsEndpoint":
"Signal-specific OTLP/HTTP metrics endpoint. When set, this overrides diagnostics.otel.endpoint and OTEL_EXPORTER_OTLP_ENDPOINT for metrics export only.",
"diagnostics.otel.logsEndpoint":
"Signal-specific OTLP/HTTP logs endpoint. When set, this overrides diagnostics.otel.endpoint and OTEL_EXPORTER_OTLP_ENDPOINT for log export only.",
"diagnostics.otel.protocol":
'OTel transport protocol for telemetry export: "http/protobuf" or "grpc" depending on collector support. Use the protocol your observability backend expects to avoid dropped telemetry payloads.',
"diagnostics.otel.headers":

View File

@@ -40,6 +40,9 @@ export const FIELD_LABELS: Record<string, string> = {
"diagnostics.stuckSessionWarnMs": "Stuck Session Warning Threshold (ms)",
"diagnostics.otel.enabled": "OpenTelemetry Enabled",
"diagnostics.otel.endpoint": "OpenTelemetry Endpoint",
"diagnostics.otel.tracesEndpoint": "OpenTelemetry Traces Endpoint",
"diagnostics.otel.metricsEndpoint": "OpenTelemetry Metrics Endpoint",
"diagnostics.otel.logsEndpoint": "OpenTelemetry Logs Endpoint",
"diagnostics.otel.protocol": "OpenTelemetry Protocol",
"diagnostics.otel.headers": "OpenTelemetry Headers",
"diagnostics.otel.serviceName": "OpenTelemetry Service Name",

View File

@@ -234,6 +234,9 @@ export type LoggingConfig = {
export type DiagnosticsOtelConfig = {
enabled?: boolean;
endpoint?: string;
tracesEndpoint?: string;
metricsEndpoint?: string;
logsEndpoint?: string;
protocol?: "http/protobuf" | "grpc";
headers?: Record<string, string>;
serviceName?: string;

View File

@@ -306,6 +306,9 @@ export const OpenClawSchema = z
.object({
enabled: z.boolean().optional(),
endpoint: z.string().optional(),
tracesEndpoint: z.string().optional(),
metricsEndpoint: z.string().optional(),
logsEndpoint: z.string().optional(),
protocol: z.union([z.literal("http/protobuf"), z.literal("grpc")]).optional(),
headers: z.record(z.string(), z.string()).optional(),
serviceName: z.string().optional(),

View File

@@ -352,6 +352,22 @@ export type DiagnosticLogRecordEvent = DiagnosticBaseEvent & {
};
};
/**
 * Diagnostic event describing the health of a telemetry exporter for one
 * signal. Extends the fields of `DiagnosticBaseEvent`.
 */
export type DiagnosticTelemetryExporterEvent = DiagnosticBaseEvent & {
type: "telemetry.exporter";
// Name of the exporter reporting the event (free-form string).
exporter: string;
// Which telemetry signal the event is about.
signal: "traces" | "metrics" | "logs";
// Outcome being reported for that signal.
status: "started" | "failure" | "dropped";
// Optional code, drawn from a fixed set, explaining the status.
reason?:
| "configured"
| "emit_failed"
| "handler_failed"
| "queue_full"
| "shutdown_failed"
| "start_failed"
| "unsupported_protocol";
// Optional free-form error classification.
// NOTE(review): presumably an error class name rather than message text — confirm with emitters.
errorCategory?: string;
};
export type DiagnosticEventPayload =
| DiagnosticUsageEvent
| DiagnosticWebhookReceivedEvent
@@ -382,7 +398,8 @@ export type DiagnosticEventPayload =
| DiagnosticMemorySampleEvent
| DiagnosticMemoryPressureEvent
| DiagnosticPayloadLargeEvent
| DiagnosticLogRecordEvent;
| DiagnosticLogRecordEvent
| DiagnosticTelemetryExporterEvent;
export type DiagnosticEventInput = DiagnosticEventPayload extends infer Event
? Event extends DiagnosticEventPayload

View File

@@ -345,6 +345,12 @@ function sanitizeDiagnosticEvent(event: DiagnosticEventPayload): DiagnosticStabi
record.pluginId = event.pluginId;
assignReasonCode(record, event.reason);
break;
case "telemetry.exporter":
record.source = event.exporter;
record.target = event.signal;
record.outcome = event.status;
assignReasonCode(record, event.reason ?? event.errorCategory);
break;
}
return record;

View File

@@ -189,6 +189,7 @@ describe("startPluginServices", () => {
});
expect(contexts[0]?.internalDiagnostics?.onEvent).toBeTypeOf("function");
expect(contexts[0]?.internalDiagnostics?.emit).toBeTypeOf("function");
const untrustedContexts: OpenClawPluginServiceContext[] = [];
const untrustedService = createTrackingService("diagnostics-otel", {

View File

@@ -1,6 +1,9 @@
import { STATE_DIR } from "../config/paths.js";
import type { OpenClawConfig } from "../config/types.openclaw.js";
import { onInternalDiagnosticEvent } from "../infra/diagnostic-events.js";
import {
emitTrustedDiagnosticEvent,
onInternalDiagnosticEvent,
} from "../infra/diagnostic-events.js";
import { createSubsystemLogger } from "../logging/subsystem.js";
import type { PluginServiceRegistration } from "./registry-types.js";
import type { PluginRegistry } from "./registry.js";
@@ -29,7 +32,12 @@ function createServiceContext(params: {
...(params.service?.origin === "bundled" &&
params.service.pluginId === "diagnostics-otel" &&
params.service.service.id === "diagnostics-otel"
? { internalDiagnostics: { onEvent: onInternalDiagnosticEvent } }
? {
internalDiagnostics: {
emit: emitTrustedDiagnosticEvent,
onEvent: onInternalDiagnosticEvent,
},
}
: {}),
};
}

View File

@@ -28,6 +28,7 @@ import type { GatewayRequestHandler } from "../gateway/server-methods/types.js";
import type { InternalHookHandler } from "../hooks/internal-hook-types.js";
import type { ImageGenerationProvider } from "../image-generation/types.js";
import type {
DiagnosticEventInput,
DiagnosticEventMetadata,
DiagnosticEventPayload,
} from "../infra/diagnostic-events.js";
@@ -1976,6 +1977,7 @@ export type OpenClawPluginServiceContext = {
stateDir: string;
logger: PluginLogger;
internalDiagnostics?: {
emit: (event: DiagnosticEventInput) => void;
onEvent: (
listener: (event: DiagnosticEventPayload, metadata: DiagnosticEventMetadata) => void,
) => () => void;