From 4630ce3d9e2f089062ccd386b0ae241325961d1a Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Fri, 24 Apr 2026 00:53:04 -0700 Subject: [PATCH] fix(diagnostics): make otel service restart safe Make diagnostics-otel startup restart-safe by tearing down stale SDK, log transport, and diagnostic-event listener handles before reinitializing or disabling the service. Adds regression coverage for repeated start and disabled restart paths.\n\nThanks @vincentkoc. --- CHANGELOG.md | 1 + .../diagnostics-otel/src/service.test.ts | 73 ++++++++++++++++++- extensions/diagnostics-otel/src/service.ts | 37 +++++++--- 3 files changed, 97 insertions(+), 14 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index e4d30cd968f..f1d22fe3a94 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,6 +12,7 @@ Docs: https://docs.openclaw.ai - Diagnostics/OTEL: add a lightweight diagnostic trace-context carrier for future span correlation without adding OTEL SDK state to core. Thanks @vincentkoc. - Diagnostics/OTEL: attach diagnostic trace context to exported OTEL logs so log records can correlate with future spans without adding retained process state. Thanks @vincentkoc. - Diagnostics/OTEL: pass immutable per-run diagnostic trace context through agent and tool hook contexts, and parent exported diagnostic spans from validated context without retaining global trace state. Thanks @vincentkoc. +- Diagnostics/OTEL: make exporter startup restart-safe so config reloads do not retain stale SDKs, log transports, or diagnostic event listeners. Thanks @vincentkoc. - Control UI/chat: add a Steer action on queued messages so a browser follow-up can be injected into the active run without retyping it. - Control UI/Talk: add browser WebRTC realtime voice sessions backed by OpenAI Realtime, with Gateway-minted ephemeral client secrets and `openclaw_agent_consult` handoff to the full OpenClaw agent. - Agents/tools: add optional per-call `timeoutMs` support for image, video, music, and TTS generation tools so agents can extend provider request timeouts only when a specific generation needs it. diff --git a/extensions/diagnostics-otel/src/service.test.ts b/extensions/diagnostics-otel/src/service.test.ts index fd2e1c30e1d..e70c41a98bd 100644 --- a/extensions/diagnostics-otel/src/service.test.ts +++ b/extensions/diagnostics-otel/src/service.test.ts @@ -166,12 +166,14 @@ function createTraceOnlyContext(endpoint: string): OpenClawPluginServiceContext type RegisteredLogTransport = (logObj: Record) => void; function setupRegisteredTransports() { const registeredTransports: RegisteredLogTransport[] = []; - const stopTransport = vi.fn(); + const stopTransports: ReturnType[] = []; registerLogTransportMock.mockImplementation((transport) => { registeredTransports.push(transport); + const stopTransport = vi.fn(); + stopTransports.push(stopTransport); return stopTransport; }); - return { registeredTransports, stopTransport }; + return { registeredTransports, stopTransports }; } async function emitAndCaptureLog(logObj: Record) { @@ -283,6 +285,73 @@ describe("diagnostics-otel service", () => { await service.stop?.(ctx); }); + test("restarts without retaining prior listeners or log transports", async () => { + const { registeredTransports, stopTransports } = setupRegisteredTransports(); + + const service = createDiagnosticsOtelService(); + const ctx = createOtelContext(OTEL_TEST_ENDPOINT, { traces: true, metrics: true, logs: true }); + await service.start(ctx); + await service.start(ctx); + + expect(registerLogTransportMock).toHaveBeenCalledTimes(2); + expect(registeredTransports).toHaveLength(2); + expect(stopTransports[0]).toHaveBeenCalledTimes(1); + expect(logShutdown).toHaveBeenCalledTimes(1); + expect(sdkShutdown).toHaveBeenCalledTimes(1); + + telemetryState.tracer.startSpan.mockClear(); + emitDiagnosticEvent({ + type: "message.processed", + channel: "telegram", + outcome: "completed", + durationMs: 10, + }); + expect(telemetryState.tracer.startSpan).toHaveBeenCalledTimes(1); + + await service.stop?.(ctx); + expect(stopTransports[1]).toHaveBeenCalledTimes(1); + expect(logShutdown).toHaveBeenCalledTimes(2); + expect(sdkShutdown).toHaveBeenCalledTimes(2); + + telemetryState.tracer.startSpan.mockClear(); + emitDiagnosticEvent({ + type: "message.processed", + channel: "telegram", + outcome: "completed", + durationMs: 10, + }); + expect(telemetryState.tracer.startSpan).not.toHaveBeenCalled(); + }); + + test("tears down active handles when restarted with diagnostics disabled", async () => { + const { stopTransports } = setupRegisteredTransports(); + + const service = createDiagnosticsOtelService(); + const enabledCtx = createOtelContext(OTEL_TEST_ENDPOINT, { + traces: true, + metrics: true, + logs: true, + }); + await service.start(enabledCtx); + await service.start({ + ...enabledCtx, + config: { diagnostics: { enabled: false } }, + }); + + expect(stopTransports[0]).toHaveBeenCalledTimes(1); + expect(logShutdown).toHaveBeenCalledTimes(1); + expect(sdkShutdown).toHaveBeenCalledTimes(1); + + telemetryState.tracer.startSpan.mockClear(); + emitDiagnosticEvent({ + type: "message.processed", + channel: "telegram", + outcome: "completed", + durationMs: 10, + }); + expect(telemetryState.tracer.startSpan).not.toHaveBeenCalled(); + }); + test("appends signal path when endpoint contains non-signal /v1 segment", async () => { const service = createDiagnosticsOtelService(); const ctx = createTraceOnlyContext("https://www.comet.com/opik/api/v1/private/otel"); diff --git a/extensions/diagnostics-otel/src/service.ts b/extensions/diagnostics-otel/src/service.ts index ee4b8019e28..acef6fdd02a 100644 --- a/extensions/diagnostics-otel/src/service.ts +++ b/extensions/diagnostics-otel/src/service.ts @@ -176,9 +176,32 @@ export function createDiagnosticsOtelService(): OpenClawPluginService { let stopLogTransport: (() => void) | null = null; let unsubscribe: (() => void) | null = null; + const stopStarted = async () => { + const currentUnsubscribe = unsubscribe; + const currentStopLogTransport = stopLogTransport; + const currentLogProvider = logProvider; + const currentSdk = sdk; + + unsubscribe = null; + stopLogTransport = null; + logProvider = null; + sdk = null; + + currentUnsubscribe?.(); + currentStopLogTransport?.(); + if (currentLogProvider) { + await currentLogProvider.shutdown().catch(() => undefined); + } + if (currentSdk) { + await currentSdk.shutdown().catch(() => undefined); + } + }; + return { id: "diagnostics-otel", async start(ctx) { + await stopStarted(); + const cfg = ctx.config.diagnostics; const otel = cfg?.otel; if (!cfg?.enabled || !otel?.enabled) { @@ -251,6 +274,7 @@ export function createDiagnosticsOtelService(): OpenClawPluginService { try { sdk.start(); } catch (err) { + await stopStarted(); ctx.logger.error(`diagnostics-otel: failed to start SDK: ${formatError(err)}`); throw err; } @@ -805,18 +829,7 @@ export function createDiagnosticsOtelService(): OpenClawPluginService { } }, async stop() { - unsubscribe?.(); - unsubscribe = null; - stopLogTransport?.(); - stopLogTransport = null; - if (logProvider) { - await logProvider.shutdown().catch(() => undefined); - logProvider = null; - } - if (sdk) { - await sdk.shutdown().catch(() => undefined); - sdk = null; - } + await stopStarted(); }, } satisfies OpenClawPluginService; }