fix(diagnostics): make otel service restart safe

Make diagnostics-otel startup restart-safe by tearing down stale SDK, log transport, and diagnostic-event listener handles before reinitializing or disabling the service. Adds regression coverage for repeated start and disabled restart paths.\n\nThanks @vincentkoc.
This commit is contained in:
Vincent Koc
2026-04-24 00:53:04 -07:00
committed by GitHub
parent cc92699ff7
commit 4630ce3d9e
3 changed files with 97 additions and 14 deletions

View File

@@ -12,6 +12,7 @@ Docs: https://docs.openclaw.ai
- Diagnostics/OTEL: add a lightweight diagnostic trace-context carrier for future span correlation without adding OTEL SDK state to core. Thanks @vincentkoc.
- Diagnostics/OTEL: attach diagnostic trace context to exported OTEL logs so log records can correlate with future spans without adding retained process state. Thanks @vincentkoc.
- Diagnostics/OTEL: pass immutable per-run diagnostic trace context through agent and tool hook contexts, and parent exported diagnostic spans from validated context without retaining global trace state. Thanks @vincentkoc.
- Diagnostics/OTEL: make exporter startup restart-safe so config reloads do not retain stale SDKs, log transports, or diagnostic event listeners. Thanks @vincentkoc.
- Control UI/chat: add a Steer action on queued messages so a browser follow-up can be injected into the active run without retyping it.
- Control UI/Talk: add browser WebRTC realtime voice sessions backed by OpenAI Realtime, with Gateway-minted ephemeral client secrets and `openclaw_agent_consult` handoff to the full OpenClaw agent.
- Agents/tools: add optional per-call `timeoutMs` support for image, video, music, and TTS generation tools so agents can extend provider request timeouts only when a specific generation needs it.

View File

@@ -166,12 +166,14 @@ function createTraceOnlyContext(endpoint: string): OpenClawPluginServiceContext
type RegisteredLogTransport = (logObj: Record<string, unknown>) => void;
function setupRegisteredTransports() {
const registeredTransports: RegisteredLogTransport[] = [];
const stopTransport = vi.fn();
const stopTransports: ReturnType<typeof vi.fn>[] = [];
registerLogTransportMock.mockImplementation((transport) => {
registeredTransports.push(transport);
const stopTransport = vi.fn();
stopTransports.push(stopTransport);
return stopTransport;
});
return { registeredTransports, stopTransport };
return { registeredTransports, stopTransports };
}
async function emitAndCaptureLog(logObj: Record<string, unknown>) {
@@ -283,6 +285,73 @@ describe("diagnostics-otel service", () => {
await service.stop?.(ctx);
});
test("restarts without retaining prior listeners or log transports", async () => {
const { registeredTransports, stopTransports } = setupRegisteredTransports();
const service = createDiagnosticsOtelService();
const ctx = createOtelContext(OTEL_TEST_ENDPOINT, { traces: true, metrics: true, logs: true });
await service.start(ctx);
await service.start(ctx);
expect(registerLogTransportMock).toHaveBeenCalledTimes(2);
expect(registeredTransports).toHaveLength(2);
expect(stopTransports[0]).toHaveBeenCalledTimes(1);
expect(logShutdown).toHaveBeenCalledTimes(1);
expect(sdkShutdown).toHaveBeenCalledTimes(1);
telemetryState.tracer.startSpan.mockClear();
emitDiagnosticEvent({
type: "message.processed",
channel: "telegram",
outcome: "completed",
durationMs: 10,
});
expect(telemetryState.tracer.startSpan).toHaveBeenCalledTimes(1);
await service.stop?.(ctx);
expect(stopTransports[1]).toHaveBeenCalledTimes(1);
expect(logShutdown).toHaveBeenCalledTimes(2);
expect(sdkShutdown).toHaveBeenCalledTimes(2);
telemetryState.tracer.startSpan.mockClear();
emitDiagnosticEvent({
type: "message.processed",
channel: "telegram",
outcome: "completed",
durationMs: 10,
});
expect(telemetryState.tracer.startSpan).not.toHaveBeenCalled();
});
test("tears down active handles when restarted with diagnostics disabled", async () => {
const { stopTransports } = setupRegisteredTransports();
const service = createDiagnosticsOtelService();
const enabledCtx = createOtelContext(OTEL_TEST_ENDPOINT, {
traces: true,
metrics: true,
logs: true,
});
await service.start(enabledCtx);
await service.start({
...enabledCtx,
config: { diagnostics: { enabled: false } },
});
expect(stopTransports[0]).toHaveBeenCalledTimes(1);
expect(logShutdown).toHaveBeenCalledTimes(1);
expect(sdkShutdown).toHaveBeenCalledTimes(1);
telemetryState.tracer.startSpan.mockClear();
emitDiagnosticEvent({
type: "message.processed",
channel: "telegram",
outcome: "completed",
durationMs: 10,
});
expect(telemetryState.tracer.startSpan).not.toHaveBeenCalled();
});
test("appends signal path when endpoint contains non-signal /v1 segment", async () => {
const service = createDiagnosticsOtelService();
const ctx = createTraceOnlyContext("https://www.comet.com/opik/api/v1/private/otel");

View File

@@ -176,9 +176,32 @@ export function createDiagnosticsOtelService(): OpenClawPluginService {
let stopLogTransport: (() => void) | null = null;
let unsubscribe: (() => void) | null = null;
const stopStarted = async () => {
const currentUnsubscribe = unsubscribe;
const currentStopLogTransport = stopLogTransport;
const currentLogProvider = logProvider;
const currentSdk = sdk;
unsubscribe = null;
stopLogTransport = null;
logProvider = null;
sdk = null;
currentUnsubscribe?.();
currentStopLogTransport?.();
if (currentLogProvider) {
await currentLogProvider.shutdown().catch(() => undefined);
}
if (currentSdk) {
await currentSdk.shutdown().catch(() => undefined);
}
};
return {
id: "diagnostics-otel",
async start(ctx) {
await stopStarted();
const cfg = ctx.config.diagnostics;
const otel = cfg?.otel;
if (!cfg?.enabled || !otel?.enabled) {
@@ -251,6 +274,7 @@ export function createDiagnosticsOtelService(): OpenClawPluginService {
try {
sdk.start();
} catch (err) {
await stopStarted();
ctx.logger.error(`diagnostics-otel: failed to start SDK: ${formatError(err)}`);
throw err;
}
@@ -805,18 +829,7 @@ export function createDiagnosticsOtelService(): OpenClawPluginService {
}
},
async stop() {
unsubscribe?.();
unsubscribe = null;
stopLogTransport?.();
stopLogTransport = null;
if (logProvider) {
await logProvider.shutdown().catch(() => undefined);
logProvider = null;
}
if (sdk) {
await sdk.shutdown().catch(() => undefined);
sdk = null;
}
await stopStarted();
},
} satisfies OpenClawPluginService;
}