fix(diagnostics-otel): suppress exporter rejection crashes (#84881)

Summary:
- The PR adds a diagnostics-otel scoped unhandled-rejection handler for nested OTLPExporterError values, unregisters it on stop/restart, adds regression tests, and adds a changelog entry.
- Reproducibility: yes. The source path is high-confidence: current main has no OTLPExporterError-specific dia ... ror for non-retryable OTLP HTTP failures; I did not run a live collector shutdown in this read-only review.

Automerge notes:
- PR branch already contained follow-up commit before automerge: fix(diagnostics-otel): avoid stale exporter handler
- PR branch already contained follow-up commit before automerge: fix(diagnostics-otel): suppress exporter rejection crashes

Validation:
- ClawSweeper review passed for head e19c06c992.
- Required merge gates passed before the squash merge.

Prepared head SHA: e19c06c992
Review: https://github.com/openclaw/openclaw/pull/84881#issuecomment-4506249586

Co-authored-by: luoyanglang <hanwanlonga@gmail.com>
Co-authored-by: clawsweeper <274271284+clawsweeper[bot]@users.noreply.github.com>
Co-authored-by: clawsweeper[bot] <274271284+clawsweeper[bot]@users.noreply.github.com>
Approved-by: takhoffman
Co-authored-by: takhoffman <781889+takhoffman@users.noreply.github.com>
This commit is contained in:
clawsweeper[bot]
2026-05-21 09:01:32 +00:00
committed by GitHub
parent 233765b361
commit 5813fa4584
3 changed files with 159 additions and 0 deletions

View File

@@ -15,6 +15,7 @@ Docs: https://docs.openclaw.ai
- doctor: constrain legacy plugin cleanup paths [AI]. (#84801) Thanks @pgondhi987.
- Update/doctor: prune stale local bundled plugin install records that point at old compiled bundled output so current bundled plugin schemas win after upgrade. (#84863) Thanks @fuller-stack-dev.
- PDF tool: time out idle remote PDF body reads after 120 seconds so stalled remote documents return an error instead of wedging the session. Fixes #68649. (#84768) Thanks @luoyanglang.
- Diagnostics/OpenTelemetry plugin: catch and suppress otherwise-unhandled OTLP exporter promise rejections so collector shutdowns no longer crash the Gateway. (#81085) Thanks @luoyanglang.
- Media/audio: skip empty structured sherpa-onnx transcripts instead of treating the raw JSON payload as spoken text. (#84667) Thanks @TurboTheTurtle.
- Node/Linux: keep `OPENCLAW_GATEWAY_TOKEN` out of generated systemd unit files by writing node service token values to a node-specific env file. (#84408)
- Memory-core/dreaming: reuse stable narrative subagent session keys per workspace and phase while keeping per-run idempotency and bounded cleanup, so stale `dreaming-narrative-*` sessions do not accumulate. Fixes #68252, #69187, and #70402. (#70464) Thanks @chiyouYCH.

View File

@@ -53,6 +53,21 @@ const logShutdown = vi.hoisted(() => vi.fn().mockResolvedValue(undefined));
const traceExporterCtor = vi.hoisted(() => vi.fn());
const metricExporterCtor = vi.hoisted(() => vi.fn());
const logExporterCtor = vi.hoisted(() => vi.fn());
// In-memory stand-in for the runtime unhandled-rejection registry. It is hoisted so the
// vi.mock factories can reference it, and it exposes the live handler list for assertions.
const unhandledRejectionHandlerState = vi.hoisted(() => {
  type RejectionHandler = (reason: unknown) => boolean;

  let registered: RejectionHandler[] = [];

  // Spy-wrapped so tests can assert on call counts; returns the matching unregister callback.
  const register = vi.fn((handler: RejectionHandler) => {
    registered.push(handler);
    return () => {
      // Rebuild the list rather than splice it, dropping every occurrence of this handler.
      registered = registered.filter((entry) => entry !== handler);
    };
  });

  const getHandlers = () => registered;

  const reset = () => {
    registered = [];
  };

  return { getHandlers, register, reset };
});
vi.mock("@opentelemetry/api", () => ({
context: {
@@ -99,6 +114,10 @@ vi.mock("@opentelemetry/exporter-logs-otlp-proto", () => ({
},
}));
// Replace the runtime-env module so `registerUnhandledRejectionHandler` resolves to the
// hoisted spy above; tests can then inspect which handlers the service registered.
vi.mock("openclaw/plugin-sdk/runtime-env", () => ({
registerUnhandledRejectionHandler: unhandledRejectionHandlerState.register,
}));
vi.mock("@opentelemetry/sdk-logs", () => ({
BatchLogRecordProcessor: function BatchLogRecordProcessor() {},
LoggerProvider: class {
@@ -336,6 +355,8 @@ describe("diagnostics-otel service", () => {
traceExporterCtor.mockClear();
metricExporterCtor.mockClear();
logExporterCtor.mockClear();
unhandledRejectionHandlerState.reset();
unhandledRejectionHandlerState.register.mockClear();
delete process.env.OTEL_EXPORTER_OTLP_TRACES_ENDPOINT;
delete process.env.OTEL_EXPORTER_OTLP_METRICS_ENDPOINT;
delete process.env.OTEL_EXPORTER_OTLP_LOGS_ENDPOINT;
@@ -531,6 +552,54 @@ describe("diagnostics-otel service", () => {
expect(telemetryState.tracer.startSpan).not.toHaveBeenCalled();
});
test("registers and removes an OTLP exporter unhandled rejection handler", async () => {
  const service = createDiagnosticsOtelService();
  const ctx = createOtelContext(OTEL_TEST_ENDPOINT, { traces: true, metrics: true, logs: true });
  await service.start(ctx);

  // Starting the service must register exactly one handler.
  expect(unhandledRejectionHandlerState.register).toHaveBeenCalledTimes(1);
  const [handler] = unhandledRejectionHandlerState.getHandlers();
  expect(handler).toBeTypeOf("function");

  // Fresh plain-object exporter error for each shape under test.
  const makePlainExporterError = () => ({
    name: "OTLPExporterError",
    code: 410,
    data: "user_stop",
  });

  // Every shape the handler is expected to recognise: a real Error instance, a bare
  // object, an array wrapper, and an AggregateError wrapper.
  const suppressedReasons: unknown[] = [
    Object.assign(new Error("collector gone"), {
      name: "OTLPExporterError",
      code: 410,
    }),
    makePlainExporterError(),
    [makePlainExporterError()],
    new AggregateError([makePlainExporterError()], "export failed"),
  ];
  for (const reason of suppressedReasons) {
    expect(handler?.(reason)).toBe(true);
  }

  // Unrelated errors must be left for other handlers.
  expect(handler?.(new Error("other exporter error"))).toBe(false);
  expect(ctx.logger.warn).toHaveBeenCalledWith(
    "diagnostics-otel: suppressed OTLP exporter unhandled rejection (code=410)",
  );

  // Stopping the service must unregister the handler again.
  await service.stop?.(ctx);
  expect(unhandledRejectionHandlerState.getHandlers()).toHaveLength(0);
});
test("does not retain an OTLP exporter handler when startup setup fails", async () => {
  // Make the first trace exporter construction blow up so start() rejects.
  const setupFailure = new Error("trace exporter setup failed");
  traceExporterCtor.mockImplementationOnce(() => {
    throw setupFailure;
  });

  const service = createDiagnosticsOtelService();
  const ctx = createOtelContext(OTEL_TEST_ENDPOINT, { traces: true });
  const startAttempt = service.start(ctx);
  await expect(startAttempt).rejects.toBe(setupFailure);

  // A failed start must leave no handler behind, and must never have registered one.
  expect(unhandledRejectionHandlerState.register).not.toHaveBeenCalled();
  expect(unhandledRejectionHandlerState.getHandlers()).toHaveLength(0);
});
test("uses a preloaded OpenTelemetry SDK without dropping diagnostic listeners", async () => {
process.env.OPENCLAW_OTEL_PRELOADED = "1";
const service = createDiagnosticsOtelService();

View File

@@ -15,6 +15,7 @@ import { PeriodicExportingMetricReader } from "@opentelemetry/sdk-metrics";
import { NodeSDK } from "@opentelemetry/sdk-node";
import { ParentBasedSampler, TraceIdRatioBasedSampler } from "@opentelemetry/sdk-trace-base";
import { ATTR_SERVICE_NAME } from "@opentelemetry/semantic-conventions";
import { registerUnhandledRejectionHandler } from "openclaw/plugin-sdk/runtime-env";
import type {
DiagnosticEventMetadata,
DiagnosticEventPayload,
@@ -169,6 +170,78 @@ function errorCategory(err: unknown): string {
}
}
/**
 * Flattens a rejection reason into every value reachable through common error-wrapping
 * shapes, in breadth-first order.
 *
 * Arrays contribute their items. Objects contribute their `cause`, `reason`, `original`
 * and `error` properties (in that order), followed by the items of an `errors` array if
 * one is present. Nullish values are skipped, and each distinct value is visited at most
 * once, so cyclic structures terminate.
 *
 * @param err - The value to inspect; may be anything, including primitives.
 * @returns Every visited value, starting with `err` itself, including wrapper arrays and
 *   primitives. Empty when `err` is `null` or `undefined`.
 */
function collectNestedErrorCandidates(err: unknown): unknown[] {
  const pending: unknown[] = [err];
  const seen = new Set<unknown>();
  const candidates: unknown[] = [];

  const enqueue = (value: unknown): void => {
    if (value != null && !seen.has(value)) {
      pending.push(value);
    }
  };

  // Walk with a cursor instead of `shift()`: shifting re-indexes the array on every
  // dequeue, which turns a large aggregate of errors into quadratic work.
  for (let cursor = 0; cursor < pending.length; cursor += 1) {
    const current = pending[cursor];
    // The same value can be queued twice before it is first visited, so re-check here.
    if (current == null || seen.has(current)) {
      continue;
    }
    seen.add(current);
    candidates.push(current);

    if (Array.isArray(current)) {
      // Arrays are pure containers: only their items are followed, not named properties.
      for (const item of current) {
        enqueue(item);
      }
      continue;
    }
    if (typeof current !== "object") {
      continue;
    }

    const record = current as Record<string, unknown>;
    for (const nested of [record.cause, record.reason, record.original, record.error]) {
      enqueue(nested);
    }
    // Read `errors` exactly once so an accessor cannot hand back a different value
    // between the array check and the iteration.
    const aggregated = record.errors;
    if (Array.isArray(aggregated)) {
      for (const nested of aggregated) {
        enqueue(nested);
      }
    }
  }
  return candidates;
}
/**
 * Reads the `name` property of an error-like object.
 *
 * @param err - Any value; only non-null objects are inspected.
 * @returns The name exactly as stored (not trimmed) when it is a string containing at
 *   least one non-whitespace character; otherwise `undefined`.
 */
function readErrorName(err: unknown): string | undefined {
  if (typeof err !== "object" || err === null) {
    return undefined;
  }
  const { name } = err as { name?: unknown };
  if (typeof name !== "string") {
    return undefined;
  }
  // Whitespace-only names count as missing, but a usable name is returned untouched.
  return name.trim().length > 0 ? name : undefined;
}
/**
 * Reads the `code` property of an error-like object.
 *
 * @param err - Any value; only non-null objects are inspected.
 * @returns The code when it is a string or a number (including `""` and `0`);
 *   otherwise `undefined`.
 */
function readErrorCode(err: unknown): string | number | undefined {
  if (typeof err !== "object" || err === null) {
    return undefined;
  }
  const { code } = err as { code?: unknown };
  if (typeof code === "number" || typeof code === "string") {
    return code;
  }
  return undefined;
}
/**
 * Searches a rejection reason, and everything nested inside it, for an object whose
 * `name` is exactly `"OTLPExporterError"`.
 *
 * @param reason - The raw rejection reason.
 * @returns The first matching object in traversal order, or `undefined` if none matches.
 */
function findOtlpExporterError(reason: unknown): object | undefined {
  // Matching is by name only, so plain objects that merely carry the name also qualify.
  const isOtlpExporterError = (candidate: unknown): candidate is object =>
    typeof candidate === "object" &&
    candidate !== null &&
    readErrorName(candidate) === "OTLPExporterError";

  return collectNestedErrorCandidates(reason).find(isOtlpExporterError);
}
function redactOtelAttributes(attributes: Record<string, string | number | boolean>) {
const redactedAttributes: Record<string, string | number | boolean> = {};
for (const [key, value] of Object.entries(attributes)) {
@@ -524,18 +597,22 @@ export function createDiagnosticsOtelService(): OpenClawPluginService {
let logProvider: LoggerProvider | null = null;
let unsubscribe: (() => void) | null = null;
let stopActiveTrustedSpans: (() => void) | null = null;
let unregisterUnhandledRejectionHandler: (() => void) | null = null;
const stopStarted = async () => {
const currentUnsubscribe = unsubscribe;
const currentLogProvider = logProvider;
const currentSdk = sdk;
const currentStopActiveTrustedSpans = stopActiveTrustedSpans;
const currentUnregisterUnhandledRejectionHandler = unregisterUnhandledRejectionHandler;
unsubscribe = null;
logProvider = null;
sdk = null;
stopActiveTrustedSpans = null;
unregisterUnhandledRejectionHandler = null;
currentUnregisterUnhandledRejectionHandler?.();
currentUnsubscribe?.();
currentStopActiveTrustedSpans?.();
if (currentLogProvider) {
@@ -2471,6 +2548,18 @@ export function createDiagnosticsOtelService(): OpenClawPluginService {
}
});
// Register a process-level rejection handler scoped to this service and keep the returned
// callback so the service can unregister it later.
// NOTE(review): returning `true` presumably tells the runtime-env registry that the
// rejection was consumed; confirm against `registerUnhandledRejectionHandler`.
unregisterUnhandledRejectionHandler = registerUnhandledRejectionHandler((reason) => {
// Look through the reason and anything nested in it for an OTLPExporterError.
const otlpError = findOtlpExporterError(reason);
if (!otlpError) {
// Not an exporter failure: decline so the rejection is not claimed here.
return false;
}
// Fall back to "unknown" when the error carries no string/number code.
const code = readErrorCode(otlpError) ?? "unknown";
ctx.logger.warn(
`diagnostics-otel: suppressed OTLP exporter unhandled rejection (code=${String(code)})`,
);
return true;
});
emitForSignals(enabledSignals, {
exporter: "diagnostics-otel",
status: "started",