mirror of
https://github.com/openclaw/openclaw.git
synced 2026-05-06 10:20:42 +00:00
fix(diagnostics-otel): support preloaded sdk mode (#71450)
This commit is contained in:
@@ -6,6 +6,7 @@ Docs: https://docs.openclaw.ai
|
||||
|
||||
### Changes
|
||||
|
||||
- Diagnostics/OTEL: support `OPENCLAW_OTEL_PRELOADED=1` so the plugin can reuse an already-registered OpenTelemetry SDK while keeping OpenClaw diagnostic listeners wired. (#70424) Thanks @jlapenna.
|
||||
- Control UI: refine the agent Tool Access panel with compact live-tool chips,
|
||||
collapsible tool groups, direct per-tool toggles, and clearer runtime/source
|
||||
provenance. (#71405) Thanks @BunsDev.
|
||||
|
||||
@@ -896,6 +896,7 @@ Notes:
|
||||
- `otel.sampleRate`: trace sampling rate `0`–`1`.
|
||||
- `otel.flushIntervalMs`: periodic telemetry flush interval in ms.
|
||||
- `otel.captureContent`: opt-in raw content capture for OTEL span attributes. Defaults to off. Boolean `true` captures non-system message/tool content; the object form lets you enable `inputMessages`, `outputMessages`, `toolInputs`, `toolOutputs`, and `systemPrompt` explicitly.
|
||||
- `OPENCLAW_OTEL_PRELOADED=1`: environment toggle for hosts that already registered a global OpenTelemetry SDK. OpenClaw then skips plugin-owned SDK startup/shutdown while keeping diagnostic listeners active.
|
||||
- `cacheTrace.enabled`: log cache trace snapshots for embedded runs (default: `false`).
|
||||
- `cacheTrace.filePath`: output path for cache trace JSONL (default: `$OPENCLAW_STATE_DIR/logs/cache-trace.jsonl`).
|
||||
- `cacheTrace.includeMessages` / `includePrompt` / `includeSystem`: control what is included in cache trace output (all default: `true`).
|
||||
|
||||
@@ -307,6 +307,10 @@ Notes:
|
||||
- Set `headers` when your collector requires auth.
|
||||
- Environment variables supported: `OTEL_EXPORTER_OTLP_ENDPOINT`,
|
||||
`OTEL_SERVICE_NAME`, `OTEL_EXPORTER_OTLP_PROTOCOL`.
|
||||
- Set `OPENCLAW_OTEL_PRELOADED=1` when another preload or host process already
|
||||
registered the global OpenTelemetry SDK. In that mode the plugin does not start
|
||||
or shut down its own SDK, but it still wires OpenClaw diagnostic listeners and
|
||||
honors `diagnostics.otel.traces`, `metrics`, and `logs`.
|
||||
|
||||
### Exported metrics (names + types)
|
||||
|
||||
@@ -389,6 +393,8 @@ classes you opted into.
|
||||
`OTEL_EXPORTER_OTLP_ENDPOINT`.
|
||||
- If the endpoint already contains `/v1/traces` or `/v1/metrics`, it is used as-is.
|
||||
- If the endpoint already contains `/v1/logs`, it is used as-is for logs.
|
||||
- `OPENCLAW_OTEL_PRELOADED=1` reuses an externally registered OpenTelemetry SDK
|
||||
for traces/metrics instead of starting a plugin-owned NodeSDK.
|
||||
- `diagnostics.otel.logs` enables OTLP log export for the main logger output.
|
||||
|
||||
### Log export behavior
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
import { beforeEach, describe, expect, test, vi } from "vitest";
|
||||
import { afterEach, beforeEach, describe, expect, test, vi } from "vitest";
|
||||
|
||||
const telemetryState = vi.hoisted(() => {
|
||||
const counters = new Map<string, { add: ReturnType<typeof vi.fn> }>();
|
||||
@@ -125,6 +125,7 @@ const GRANDCHILD_SPAN_ID = "2222222222222222";
|
||||
const PROTO_KEY = "__proto__";
|
||||
const MAX_TEST_OTEL_CONTENT_ATTRIBUTE_CHARS = 4096;
|
||||
const OTEL_TRUNCATED_SUFFIX_MAX_CHARS = 20;
|
||||
const ORIGINAL_OPENCLAW_OTEL_PRELOADED = process.env.OPENCLAW_OTEL_PRELOADED;
|
||||
|
||||
function createLogger() {
|
||||
return {
|
||||
@@ -194,6 +195,7 @@ function flushDiagnosticEvents() {
|
||||
|
||||
describe("diagnostics-otel service", () => {
|
||||
beforeEach(() => {
|
||||
delete process.env.OPENCLAW_OTEL_PRELOADED;
|
||||
telemetryState.counters.clear();
|
||||
telemetryState.histograms.clear();
|
||||
telemetryState.spans.length = 0;
|
||||
@@ -208,6 +210,14 @@ describe("diagnostics-otel service", () => {
|
||||
traceExporterCtor.mockClear();
|
||||
});
|
||||
|
||||
// Restore OPENCLAW_OTEL_PRELOADED to the value saved in
// ORIGINAL_OPENCLAW_OTEL_PRELOADED so a test that sets the flag does not leak
// it into later tests.
afterEach(() => {
  if (ORIGINAL_OPENCLAW_OTEL_PRELOADED !== undefined) {
    process.env.OPENCLAW_OTEL_PRELOADED = ORIGINAL_OPENCLAW_OTEL_PRELOADED;
    return;
  }
  // The variable was unset originally, so remove it entirely.
  delete process.env.OPENCLAW_OTEL_PRELOADED;
});
|
||||
|
||||
test("records message-flow metrics and spans", async () => {
|
||||
const service = createDiagnosticsOtelService();
|
||||
const ctx = createOtelContext(OTEL_TEST_ENDPOINT, { traces: true, metrics: true, logs: true });
|
||||
@@ -318,6 +328,84 @@ describe("diagnostics-otel service", () => {
|
||||
expect(telemetryState.tracer.startSpan).not.toHaveBeenCalled();
|
||||
});
|
||||
|
||||
// Preloaded mode: the service must not build or start its own SDK or trace
// exporter, while diagnostic events still reach metrics, spans, and logs.
test("uses a preloaded OpenTelemetry SDK without dropping diagnostic listeners", async () => {
  process.env.OPENCLAW_OTEL_PRELOADED = "1";
  const otelService = createDiagnosticsOtelService();
  const context = createOtelContext(OTEL_TEST_ENDPOINT, {
    traces: true,
    metrics: true,
    logs: true,
  });

  await otelService.start(context);

  // No plugin-owned SDK startup or trace exporter construction.
  expect(sdkStart).not.toHaveBeenCalled();
  expect(traceExporterCtor).not.toHaveBeenCalled();
  expect(context.logger.info).toHaveBeenCalledWith(
    "diagnostics-otel: using preloaded OpenTelemetry SDK",
  );

  emitDiagnosticEvent({
    type: "run.completed",
    runId: "run-1",
    provider: "openai",
    model: "gpt-5.4",
    outcome: "completed",
    durationMs: 100,
  });
  emitDiagnosticEvent({
    type: "log.record",
    level: "INFO",
    message: "preloaded log",
  });
  await flushDiagnosticEvents();

  // Metrics listener recorded the run duration.
  const expectedMetricAttributes = expect.objectContaining({
    "openclaw.provider": "openai",
    "openclaw.model": "gpt-5.4",
  });
  const runDurationHistogram = telemetryState.histograms.get("openclaw.run.duration_ms");
  expect(runDurationHistogram?.record).toHaveBeenCalledWith(100, expectedMetricAttributes);

  // Trace listener opened a run span.
  const expectedSpanOptions = expect.objectContaining({
    attributes: expect.objectContaining({
      "openclaw.outcome": "completed",
    }),
  });
  expect(telemetryState.tracer.startSpan).toHaveBeenCalledWith(
    "openclaw.run",
    expectedSpanOptions,
    undefined,
  );

  // Log listener emitted a record.
  expect(logEmit).toHaveBeenCalled();

  // On stop, the SDK shutdown hook is not called, but the log shutdown hook is.
  await otelService.stop?.(context);
  expect(sdkShutdown).not.toHaveBeenCalled();
  expect(logShutdown).toHaveBeenCalledTimes(1);
});
|
||||
|
||||
// Preloaded mode must still respect per-signal toggles: with traces disabled
// and metrics enabled, a run event records a metric but opens no span.
test("honors disabled traces when an OpenTelemetry SDK is preloaded", async () => {
  process.env.OPENCLAW_OTEL_PRELOADED = "1";
  const service = createDiagnosticsOtelService();
  const ctx = createOtelContext(OTEL_TEST_ENDPOINT, { traces: false, metrics: true });
  await service.start(ctx);

  emitDiagnosticEvent({
    type: "run.completed",
    runId: "run-1",
    provider: "openai",
    model: "gpt-5.4",
    outcome: "completed",
    durationMs: 100,
  });
  // Flush before asserting, as the sibling preloaded-SDK test does, so the
  // negative startSpan assertion below cannot pass merely because the event
  // had not been delivered yet.
  await flushDiagnosticEvents();

  expect(sdkStart).not.toHaveBeenCalled();
  expect(telemetryState.histograms.get("openclaw.run.duration_ms")?.record).toHaveBeenCalledWith(
    100,
    expect.objectContaining({
      "openclaw.provider": "openai",
    }),
  );
  expect(telemetryState.tracer.startSpan).not.toHaveBeenCalled();

  await service.stop?.(ctx);
  expect(sdkShutdown).not.toHaveBeenCalled();
});
|
||||
|
||||
test("tears down active handles when restarted with diagnostics disabled", async () => {
|
||||
const service = createDiagnosticsOtelService();
|
||||
const enabledCtx = createOtelContext(OTEL_TEST_ENDPOINT, {
|
||||
|
||||
@@ -49,6 +49,7 @@ const LOG_RECORD_EXPORT_FAILURE_REPORT_INTERVAL_MS = 60_000;
|
||||
const OTEL_LOG_RAW_ATTRIBUTE_KEY_RE = /^[A-Za-z0-9_.:-]{1,64}$/u;
|
||||
const OTEL_LOG_ATTRIBUTE_KEY_RE = /^[A-Za-z0-9_.:-]{1,96}$/u;
|
||||
const BLOCKED_OTEL_LOG_ATTRIBUTE_KEYS = new Set(["__proto__", "prototype", "constructor"]);
|
||||
const PRELOADED_OTEL_SDK_ENV = "OPENCLAW_OTEL_PRELOADED";
|
||||
|
||||
type OtelContentCapturePolicy = {
|
||||
inputMessages: boolean;
|
||||
@@ -164,6 +165,10 @@ function resolveContentCapturePolicy(value: unknown): OtelContentCapturePolicy {
|
||||
};
|
||||
}
|
||||
|
||||
/**
 * Reports whether the environment variable named by `PRELOADED_OTEL_SDK_ENV`
 * is set to exactly `"1"`.
 *
 * @returns `true` only for the exact string `"1"`; any other value, or an
 * unset variable, yields `false`.
 */
function hasPreloadedOtelSdk(): boolean {
  const flag = process.env[PRELOADED_OTEL_SDK_ENV];
  return flag === "1";
}
|
||||
|
||||
function normalizeOtelContentValue(value: unknown): string | undefined {
|
||||
if (typeof value === "string") {
|
||||
return normalizeOtelLogString(value, MAX_OTEL_CONTENT_ATTRIBUTE_CHARS);
|
||||
@@ -400,38 +405,39 @@ export function createDiagnosticsOtelService(): OpenClawPluginService {
|
||||
if (!tracesEnabled && !metricsEnabled && !logsEnabled) {
|
||||
return;
|
||||
}
|
||||
const sdkPreloaded = hasPreloadedOtelSdk();
|
||||
|
||||
const resource = resourceFromAttributes({
|
||||
[ATTR_SERVICE_NAME]: serviceName,
|
||||
});
|
||||
|
||||
const traceUrl = resolveOtelUrl(endpoint, "v1/traces");
|
||||
const metricUrl = resolveOtelUrl(endpoint, "v1/metrics");
|
||||
const logUrl = resolveOtelUrl(endpoint, "v1/logs");
|
||||
const traceExporter = tracesEnabled
|
||||
? new OTLPTraceExporter({
|
||||
...(traceUrl ? { url: traceUrl } : {}),
|
||||
...(headers ? { headers } : {}),
|
||||
})
|
||||
: undefined;
|
||||
if (!sdkPreloaded && (tracesEnabled || metricsEnabled)) {
|
||||
const traceUrl = resolveOtelUrl(endpoint, "v1/traces");
|
||||
const metricUrl = resolveOtelUrl(endpoint, "v1/metrics");
|
||||
const traceExporter = tracesEnabled
|
||||
? new OTLPTraceExporter({
|
||||
...(traceUrl ? { url: traceUrl } : {}),
|
||||
...(headers ? { headers } : {}),
|
||||
})
|
||||
: undefined;
|
||||
|
||||
const metricExporter = metricsEnabled
|
||||
? new OTLPMetricExporter({
|
||||
...(metricUrl ? { url: metricUrl } : {}),
|
||||
...(headers ? { headers } : {}),
|
||||
})
|
||||
: undefined;
|
||||
const metricExporter = metricsEnabled
|
||||
? new OTLPMetricExporter({
|
||||
...(metricUrl ? { url: metricUrl } : {}),
|
||||
...(headers ? { headers } : {}),
|
||||
})
|
||||
: undefined;
|
||||
|
||||
const metricReader = metricExporter
|
||||
? new PeriodicExportingMetricReader({
|
||||
exporter: metricExporter,
|
||||
...(typeof otel.flushIntervalMs === "number"
|
||||
? { exportIntervalMillis: Math.max(1000, otel.flushIntervalMs) }
|
||||
: {}),
|
||||
})
|
||||
: undefined;
|
||||
const metricReader = metricExporter
|
||||
? new PeriodicExportingMetricReader({
|
||||
exporter: metricExporter,
|
||||
...(typeof otel.flushIntervalMs === "number"
|
||||
? { exportIntervalMillis: Math.max(1000, otel.flushIntervalMs) }
|
||||
: {}),
|
||||
})
|
||||
: undefined;
|
||||
|
||||
if (tracesEnabled || metricsEnabled) {
|
||||
sdk = new NodeSDK({
|
||||
resource,
|
||||
...(traceExporter ? { traceExporter } : {}),
|
||||
@@ -452,6 +458,8 @@ export function createDiagnosticsOtelService(): OpenClawPluginService {
|
||||
ctx.logger.error(`diagnostics-otel: failed to start SDK: ${formatError(err)}`);
|
||||
throw err;
|
||||
}
|
||||
} else if (sdkPreloaded && (tracesEnabled || metricsEnabled)) {
|
||||
ctx.logger.info("diagnostics-otel: using preloaded OpenTelemetry SDK");
|
||||
}
|
||||
|
||||
const logSeverityMap: Record<string, SeverityNumber> = {
|
||||
|
||||
Reference in New Issue
Block a user