mirror of
https://github.com/openclaw/openclaw.git
synced 2026-05-06 16:10:49 +00:00
fix(diagnostics): make otel service restart safe
Make diagnostics-otel startup restart-safe by tearing down stale SDK, log transport, and diagnostic-event listener handles before reinitializing or disabling the service. Adds regression coverage for repeated start and disabled restart paths.\n\nThanks @vincentkoc.
This commit is contained in:
@@ -12,6 +12,7 @@ Docs: https://docs.openclaw.ai
|
||||
- Diagnostics/OTEL: add a lightweight diagnostic trace-context carrier for future span correlation without adding OTEL SDK state to core. Thanks @vincentkoc.
|
||||
- Diagnostics/OTEL: attach diagnostic trace context to exported OTEL logs so log records can correlate with future spans without adding retained process state. Thanks @vincentkoc.
|
||||
- Diagnostics/OTEL: pass immutable per-run diagnostic trace context through agent and tool hook contexts, and parent exported diagnostic spans from validated context without retaining global trace state. Thanks @vincentkoc.
|
||||
- Diagnostics/OTEL: make exporter startup restart-safe so config reloads do not retain stale SDKs, log transports, or diagnostic event listeners. Thanks @vincentkoc.
|
||||
- Control UI/chat: add a Steer action on queued messages so a browser follow-up can be injected into the active run without retyping it.
|
||||
- Control UI/Talk: add browser WebRTC realtime voice sessions backed by OpenAI Realtime, with Gateway-minted ephemeral client secrets and `openclaw_agent_consult` handoff to the full OpenClaw agent.
|
||||
- Agents/tools: add optional per-call `timeoutMs` support for image, video, music, and TTS generation tools so agents can extend provider request timeouts only when a specific generation needs it.
|
||||
|
||||
@@ -166,12 +166,14 @@ function createTraceOnlyContext(endpoint: string): OpenClawPluginServiceContext
|
||||
type RegisteredLogTransport = (logObj: Record<string, unknown>) => void;
|
||||
function setupRegisteredTransports() {
|
||||
const registeredTransports: RegisteredLogTransport[] = [];
|
||||
const stopTransport = vi.fn();
|
||||
const stopTransports: ReturnType<typeof vi.fn>[] = [];
|
||||
registerLogTransportMock.mockImplementation((transport) => {
|
||||
registeredTransports.push(transport);
|
||||
const stopTransport = vi.fn();
|
||||
stopTransports.push(stopTransport);
|
||||
return stopTransport;
|
||||
});
|
||||
return { registeredTransports, stopTransport };
|
||||
return { registeredTransports, stopTransports };
|
||||
}
|
||||
|
||||
async function emitAndCaptureLog(logObj: Record<string, unknown>) {
|
||||
@@ -283,6 +285,73 @@ describe("diagnostics-otel service", () => {
|
||||
await service.stop?.(ctx);
|
||||
});
|
||||
|
||||
test("restarts without retaining prior listeners or log transports", async () => {
|
||||
const { registeredTransports, stopTransports } = setupRegisteredTransports();
|
||||
|
||||
const service = createDiagnosticsOtelService();
|
||||
const ctx = createOtelContext(OTEL_TEST_ENDPOINT, { traces: true, metrics: true, logs: true });
|
||||
await service.start(ctx);
|
||||
await service.start(ctx);
|
||||
|
||||
expect(registerLogTransportMock).toHaveBeenCalledTimes(2);
|
||||
expect(registeredTransports).toHaveLength(2);
|
||||
expect(stopTransports[0]).toHaveBeenCalledTimes(1);
|
||||
expect(logShutdown).toHaveBeenCalledTimes(1);
|
||||
expect(sdkShutdown).toHaveBeenCalledTimes(1);
|
||||
|
||||
telemetryState.tracer.startSpan.mockClear();
|
||||
emitDiagnosticEvent({
|
||||
type: "message.processed",
|
||||
channel: "telegram",
|
||||
outcome: "completed",
|
||||
durationMs: 10,
|
||||
});
|
||||
expect(telemetryState.tracer.startSpan).toHaveBeenCalledTimes(1);
|
||||
|
||||
await service.stop?.(ctx);
|
||||
expect(stopTransports[1]).toHaveBeenCalledTimes(1);
|
||||
expect(logShutdown).toHaveBeenCalledTimes(2);
|
||||
expect(sdkShutdown).toHaveBeenCalledTimes(2);
|
||||
|
||||
telemetryState.tracer.startSpan.mockClear();
|
||||
emitDiagnosticEvent({
|
||||
type: "message.processed",
|
||||
channel: "telegram",
|
||||
outcome: "completed",
|
||||
durationMs: 10,
|
||||
});
|
||||
expect(telemetryState.tracer.startSpan).not.toHaveBeenCalled();
|
||||
});
|
||||
|
||||
test("tears down active handles when restarted with diagnostics disabled", async () => {
|
||||
const { stopTransports } = setupRegisteredTransports();
|
||||
|
||||
const service = createDiagnosticsOtelService();
|
||||
const enabledCtx = createOtelContext(OTEL_TEST_ENDPOINT, {
|
||||
traces: true,
|
||||
metrics: true,
|
||||
logs: true,
|
||||
});
|
||||
await service.start(enabledCtx);
|
||||
await service.start({
|
||||
...enabledCtx,
|
||||
config: { diagnostics: { enabled: false } },
|
||||
});
|
||||
|
||||
expect(stopTransports[0]).toHaveBeenCalledTimes(1);
|
||||
expect(logShutdown).toHaveBeenCalledTimes(1);
|
||||
expect(sdkShutdown).toHaveBeenCalledTimes(1);
|
||||
|
||||
telemetryState.tracer.startSpan.mockClear();
|
||||
emitDiagnosticEvent({
|
||||
type: "message.processed",
|
||||
channel: "telegram",
|
||||
outcome: "completed",
|
||||
durationMs: 10,
|
||||
});
|
||||
expect(telemetryState.tracer.startSpan).not.toHaveBeenCalled();
|
||||
});
|
||||
|
||||
test("appends signal path when endpoint contains non-signal /v1 segment", async () => {
|
||||
const service = createDiagnosticsOtelService();
|
||||
const ctx = createTraceOnlyContext("https://www.comet.com/opik/api/v1/private/otel");
|
||||
|
||||
@@ -176,9 +176,32 @@ export function createDiagnosticsOtelService(): OpenClawPluginService {
|
||||
let stopLogTransport: (() => void) | null = null;
|
||||
let unsubscribe: (() => void) | null = null;
|
||||
|
||||
const stopStarted = async () => {
|
||||
const currentUnsubscribe = unsubscribe;
|
||||
const currentStopLogTransport = stopLogTransport;
|
||||
const currentLogProvider = logProvider;
|
||||
const currentSdk = sdk;
|
||||
|
||||
unsubscribe = null;
|
||||
stopLogTransport = null;
|
||||
logProvider = null;
|
||||
sdk = null;
|
||||
|
||||
currentUnsubscribe?.();
|
||||
currentStopLogTransport?.();
|
||||
if (currentLogProvider) {
|
||||
await currentLogProvider.shutdown().catch(() => undefined);
|
||||
}
|
||||
if (currentSdk) {
|
||||
await currentSdk.shutdown().catch(() => undefined);
|
||||
}
|
||||
};
|
||||
|
||||
return {
|
||||
id: "diagnostics-otel",
|
||||
async start(ctx) {
|
||||
await stopStarted();
|
||||
|
||||
const cfg = ctx.config.diagnostics;
|
||||
const otel = cfg?.otel;
|
||||
if (!cfg?.enabled || !otel?.enabled) {
|
||||
@@ -251,6 +274,7 @@ export function createDiagnosticsOtelService(): OpenClawPluginService {
|
||||
try {
|
||||
sdk.start();
|
||||
} catch (err) {
|
||||
await stopStarted();
|
||||
ctx.logger.error(`diagnostics-otel: failed to start SDK: ${formatError(err)}`);
|
||||
throw err;
|
||||
}
|
||||
@@ -805,18 +829,7 @@ export function createDiagnosticsOtelService(): OpenClawPluginService {
|
||||
}
|
||||
},
|
||||
async stop() {
|
||||
unsubscribe?.();
|
||||
unsubscribe = null;
|
||||
stopLogTransport?.();
|
||||
stopLogTransport = null;
|
||||
if (logProvider) {
|
||||
await logProvider.shutdown().catch(() => undefined);
|
||||
logProvider = null;
|
||||
}
|
||||
if (sdk) {
|
||||
await sdk.shutdown().catch(() => undefined);
|
||||
sdk = null;
|
||||
}
|
||||
await stopStarted();
|
||||
},
|
||||
} satisfies OpenClawPluginService;
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user