From 5813fa4584cf0cd4370d844f976db2332b3813ae Mon Sep 17 00:00:00 2001
From: "clawsweeper[bot]" <274271284+clawsweeper[bot]@users.noreply.github.com>
Date: Thu, 21 May 2026 09:01:32 +0000
Subject: [PATCH] fix(diagnostics-otel): suppress exporter rejection crashes (#84881)

Summary:
- The PR adds a diagnostics-otel scoped unhandled-rejection handler for nested OTLPExporterError values, unregisters it on stop/restart, adds regression tests, and adds a changelog entry.
- Reproducibility: yes. The source path is high-confidence: current main has no OTLPExporterError-specific dia ... ror for non-retryable OTLP HTTP failures; I did not run a live collector shutdown in this read-only review.

Automerge notes:
- PR branch already contained follow-up commit before automerge: fix(diagnostics-otel): avoid stale exporter handler
- PR branch already contained follow-up commit before automerge: fix(diagnostics-otel): suppress exporter rejection crashes

Validation:
- ClawSweeper review passed for head e19c06c992748cbf10687a513362b533f9159e73.
- Required merge gates passed before the squash merge.

Prepared head SHA: e19c06c992748cbf10687a513362b533f9159e73
Review: https://github.com/openclaw/openclaw/pull/84881#issuecomment-4506249586

Co-authored-by: luoyanglang
Co-authored-by: clawsweeper <274271284+clawsweeper[bot]@users.noreply.github.com>
Co-authored-by: clawsweeper[bot] <274271284+clawsweeper[bot]@users.noreply.github.com>
Approved-by: takhoffman
Co-authored-by: takhoffman <781889+takhoffman@users.noreply.github.com>
---
 CHANGELOG.md | 1 +
 .../diagnostics-otel/src/service.test.ts | 69 ++++++++++++++
 extensions/diagnostics-otel/src/service.ts | 89 +++++++++++++++++++
 3 files changed, 159 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index b6c9ad82bf8..37028de7aa5 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -15,6 +15,7 @@ Docs: https://docs.openclaw.ai
 - doctor: constrain legacy plugin cleanup paths [AI]. (#84801) Thanks @pgondhi987.
 - Update/doctor: prune stale local bundled plugin install records that point at old compiled bundled output so current bundled plugin schemas win after upgrade. (#84863) Thanks @fuller-stack-dev.
 - PDF tool: time out idle remote PDF body reads after 120 seconds so stalled remote documents return an error instead of wedging the session. Fixes #68649. (#84768) Thanks @luoyanglang.
+- Diagnostics/OpenTelemetry plugin: suppress handled OTLP exporter promise rejections so collector shutdowns no longer crash the Gateway. (#81085) Thanks @luoyanglang.
 - Media/audio: skip empty structured sherpa-onnx transcripts instead of treating the raw JSON payload as spoken text. (#84667) Thanks @TurboTheTurtle.
 - Node/Linux: keep `OPENCLAW_GATEWAY_TOKEN` out of generated systemd unit files by writing node service token values to a node-specific env file. (#84408)
 - Memory-core/dreaming: reuse stable narrative subagent session keys per workspace and phase while keeping per-run idempotency and bounded cleanup, so stale `dreaming-narrative-*` sessions do not accumulate. Fixes #68252, #69187, and #70402. (#70464) Thanks @chiyouYCH.
diff --git a/extensions/diagnostics-otel/src/service.test.ts b/extensions/diagnostics-otel/src/service.test.ts
index 4e88a9b2b6f..bbfc565257f 100644
--- a/extensions/diagnostics-otel/src/service.test.ts
+++ b/extensions/diagnostics-otel/src/service.test.ts
@@ -53,6 +53,21 @@ const logShutdown = vi.hoisted(() => vi.fn().mockResolvedValue(undefined));
 const traceExporterCtor = vi.hoisted(() => vi.fn());
 const metricExporterCtor = vi.hoisted(() => vi.fn());
 const logExporterCtor = vi.hoisted(() => vi.fn());
+const unhandledRejectionHandlerState = vi.hoisted(() => {
+  let handlers: Array<(reason: unknown) => boolean> = [];
+  return {
+    getHandlers: () => handlers,
+    register: vi.fn((handler: (reason: unknown) => boolean) => {
+      handlers.push(handler);
+      return () => {
+        handlers = handlers.filter((candidate) => candidate !== handler);
+      };
+    }),
+    reset: () => {
+      handlers = [];
+    },
+  };
+});
 
 vi.mock("@opentelemetry/api", () => ({
   context: {
@@ -99,6 +114,10 @@ vi.mock("@opentelemetry/exporter-logs-otlp-proto", () => ({
   },
 }));
 
+vi.mock("openclaw/plugin-sdk/runtime-env", () => ({
+  registerUnhandledRejectionHandler: unhandledRejectionHandlerState.register,
+}));
+
 vi.mock("@opentelemetry/sdk-logs", () => ({
   BatchLogRecordProcessor: function BatchLogRecordProcessor() {},
   LoggerProvider: class {
@@ -336,6 +355,8 @@ describe("diagnostics-otel service", () => {
     traceExporterCtor.mockClear();
     metricExporterCtor.mockClear();
     logExporterCtor.mockClear();
+    unhandledRejectionHandlerState.reset();
+    unhandledRejectionHandlerState.register.mockClear();
     delete process.env.OTEL_EXPORTER_OTLP_TRACES_ENDPOINT;
     delete process.env.OTEL_EXPORTER_OTLP_METRICS_ENDPOINT;
     delete process.env.OTEL_EXPORTER_OTLP_LOGS_ENDPOINT;
@@ -531,6 +552,54 @@ describe("diagnostics-otel service", () => {
     expect(telemetryState.tracer.startSpan).not.toHaveBeenCalled();
   });
 
+  test("registers and removes an OTLP exporter unhandled rejection handler", async () => {
+    const service = createDiagnosticsOtelService();
+    const ctx = createOtelContext(OTEL_TEST_ENDPOINT, { traces: true, metrics: true, logs: true });
+
+    await service.start(ctx);
+
+    expect(unhandledRejectionHandlerState.register).toHaveBeenCalledTimes(1);
+    const handler = unhandledRejectionHandlerState.getHandlers()[0];
+    expect(handler).toBeTypeOf("function");
+
+    const errorInstance = Object.assign(new Error("collector gone"), {
+      name: "OTLPExporterError",
+      code: 410,
+    });
+    expect(handler?.(errorInstance)).toBe(true);
+    expect(handler?.({ name: "OTLPExporterError", code: 410, data: "user_stop" })).toBe(true);
+    expect(handler?.([{ name: "OTLPExporterError", code: 410, data: "user_stop" }])).toBe(true);
+    expect(
+      handler?.(
+        new AggregateError(
+          [{ name: "OTLPExporterError", code: 410, data: "user_stop" }],
+          "export failed",
+        ),
+      ),
+    ).toBe(true);
+    expect(handler?.(new Error("other exporter error"))).toBe(false);
+    expect(ctx.logger.warn).toHaveBeenCalledWith(
+      "diagnostics-otel: suppressed OTLP exporter unhandled rejection (code=410)",
+    );
+
+    await service.stop?.(ctx);
+    expect(unhandledRejectionHandlerState.getHandlers()).toHaveLength(0);
+  });
+
+  test("does not retain an OTLP exporter handler when startup setup fails", async () => {
+    const startupError = new Error("trace exporter setup failed");
+    traceExporterCtor.mockImplementationOnce(() => {
+      throw startupError;
+    });
+    const service = createDiagnosticsOtelService();
+    const ctx = createOtelContext(OTEL_TEST_ENDPOINT, { traces: true });
+
+    await expect(service.start(ctx)).rejects.toBe(startupError);
+
+    expect(unhandledRejectionHandlerState.register).not.toHaveBeenCalled();
+    expect(unhandledRejectionHandlerState.getHandlers()).toHaveLength(0);
+  });
+
   test("uses a preloaded OpenTelemetry SDK without dropping diagnostic listeners", async () => {
     process.env.OPENCLAW_OTEL_PRELOADED = "1";
     const service = createDiagnosticsOtelService();
diff --git a/extensions/diagnostics-otel/src/service.ts b/extensions/diagnostics-otel/src/service.ts
index def592bd25d..98a88fd2742 100644
--- a/extensions/diagnostics-otel/src/service.ts
+++ b/extensions/diagnostics-otel/src/service.ts
@@ -15,6 +15,7 @@ import { PeriodicExportingMetricReader } from "@opentelemetry/sdk-metrics";
 import { NodeSDK } from "@opentelemetry/sdk-node";
 import { ParentBasedSampler, TraceIdRatioBasedSampler } from "@opentelemetry/sdk-trace-base";
 import { ATTR_SERVICE_NAME } from "@opentelemetry/semantic-conventions";
+import { registerUnhandledRejectionHandler } from "openclaw/plugin-sdk/runtime-env";
 import type {
   DiagnosticEventMetadata,
   DiagnosticEventPayload,
@@ -169,6 +170,78 @@ function errorCategory(err: unknown): string {
   }
 }
 
+function collectNestedErrorCandidates(err: unknown): unknown[] {
+  const queue: unknown[] = [err];
+  const seen = new Set();
+  const candidates: unknown[] = [];
+
+  while (queue.length > 0) {
+    const current = queue.shift();
+    if (current == null || seen.has(current)) {
+      continue;
+    }
+    seen.add(current);
+    candidates.push(current);
+
+    if (Array.isArray(current)) {
+      for (const item of current) {
+        if (item != null && !seen.has(item)) {
+          queue.push(item);
+        }
+      }
+      continue;
+    }
+    if (typeof current !== "object") {
+      continue;
+    }
+
+    const record = current as Record<string, unknown>;
+    for (const nested of [record.cause, record.reason, record.original, record.error]) {
+      if (nested != null && !seen.has(nested)) {
+        queue.push(nested);
+      }
+    }
+    if (Array.isArray(record.errors)) {
+      for (const nested of record.errors) {
+        if (nested != null && !seen.has(nested)) {
+          queue.push(nested);
+        }
+      }
+    }
+  }
+
+  return candidates;
+}
+
+function readErrorName(err: unknown): string | undefined {
+  if (!err || typeof err !== "object") {
+    return undefined;
+  }
+  const name = (err as { name?: unknown }).name;
+  return typeof name === "string" && name.trim() ? name : undefined;
+}
+
+function readErrorCode(err: unknown): string | number | undefined {
+  if (!err || typeof err !== "object") {
+    return undefined;
+  }
+  const code = (err as { code?: unknown }).code;
+  return typeof code === "string" || typeof code === "number" ? code : undefined;
+}
+
+function findOtlpExporterError(reason: unknown): object | undefined {
+  for (const candidate of collectNestedErrorCandidates(reason)) {
+    if (
+      readErrorName(candidate) === "OTLPExporterError" &&
+      candidate &&
+      typeof candidate === "object"
+    ) {
+      return candidate;
+    }
+  }
+  return undefined;
+}
+
 function redactOtelAttributes(attributes: Record) {
   const redactedAttributes: Record = {};
   for (const [key, value] of Object.entries(attributes)) {
@@ -524,18 +597,22 @@ export function createDiagnosticsOtelService(): OpenClawPluginService {
   let logProvider: LoggerProvider | null = null;
   let unsubscribe: (() => void) | null = null;
   let stopActiveTrustedSpans: (() => void) | null = null;
+  let unregisterUnhandledRejectionHandler: (() => void) | null = null;
 
   const stopStarted = async () => {
     const currentUnsubscribe = unsubscribe;
     const currentLogProvider = logProvider;
     const currentSdk = sdk;
     const currentStopActiveTrustedSpans = stopActiveTrustedSpans;
+    const currentUnregisterUnhandledRejectionHandler = unregisterUnhandledRejectionHandler;
 
     unsubscribe = null;
     logProvider = null;
     sdk = null;
     stopActiveTrustedSpans = null;
+    unregisterUnhandledRejectionHandler = null;
 
+    currentUnregisterUnhandledRejectionHandler?.();
     currentUnsubscribe?.();
     currentStopActiveTrustedSpans?.();
     if (currentLogProvider) {
@@ -2471,6 +2548,18 @@ export function createDiagnosticsOtelService(): OpenClawPluginService {
         }
       });
 
+      unregisterUnhandledRejectionHandler = registerUnhandledRejectionHandler((reason) => {
+        const otlpError = findOtlpExporterError(reason);
+        if (!otlpError) {
+          return false;
+        }
+        const code = readErrorCode(otlpError) ?? "unknown";
+        ctx.logger.warn(
+          `diagnostics-otel: suppressed OTLP exporter unhandled rejection (code=${String(code)})`,
+        );
+        return true;
+      });
+
       emitForSignals(enabledSignals, {
         exporter: "diagnostics-otel",
         status: "started",