mirror of
https://github.com/openclaw/openclaw.git
synced 2026-05-25 07:33:04 +00:00
fix(diagnostics-otel): suppress exporter rejection crashes (#84881)
Summary: - The PR adds a diagnostics-otel scoped unhandled-rejection handler for nested OTLPExporterError values, unregisters it on stop/restart, adds regression tests, and adds a changelog entry. - Reproducibility: yes. The source path is high-confidence: current main has no OTLPExporterError-specific dia ... ror for non-retryable OTLP HTTP failures; I did not run a live collector shutdown in this read-only review. Automerge notes: - PR branch already contained follow-up commit before automerge: fix(diagnostics-otel): avoid stale exporter handler - PR branch already contained follow-up commit before automerge: fix(diagnostics-otel): suppress exporter rejection crashes Validation: - ClawSweeper review passed for head e19c06c992. - Required merge gates passed before the squash merge. Prepared head SHA: e19c06c992 Review: https://github.com/openclaw/openclaw/pull/84881#issuecomment-4506249586 Co-authored-by: luoyanglang <hanwanlonga@gmail.com> Co-authored-by: clawsweeper <274271284+clawsweeper[bot]@users.noreply.github.com> Co-authored-by: clawsweeper[bot] <274271284+clawsweeper[bot]@users.noreply.github.com> Approved-by: takhoffman Co-authored-by: takhoffman <781889+takhoffman@users.noreply.github.com>
This commit is contained in:
@@ -15,6 +15,7 @@ Docs: https://docs.openclaw.ai
|
||||
- doctor: constrain legacy plugin cleanup paths [AI]. (#84801) Thanks @pgondhi987.
|
||||
- Update/doctor: prune stale local bundled plugin install records that point at old compiled bundled output so current bundled plugin schemas win after upgrade. (#84863) Thanks @fuller-stack-dev.
|
||||
- PDF tool: time out idle remote PDF body reads after 120 seconds so stalled remote documents return an error instead of wedging the session. Fixes #68649. (#84768) Thanks @luoyanglang.
|
||||
- Diagnostics/OpenTelemetry plugin: suppress handled OTLP exporter promise rejections so collector shutdowns no longer crash the Gateway. (#81085) Thanks @luoyanglang.
|
||||
- Media/audio: skip empty structured sherpa-onnx transcripts instead of treating the raw JSON payload as spoken text. (#84667) Thanks @TurboTheTurtle.
|
||||
- Node/Linux: keep `OPENCLAW_GATEWAY_TOKEN` out of generated systemd unit files by writing node service token values to a node-specific env file. (#84408)
|
||||
- Memory-core/dreaming: reuse stable narrative subagent session keys per workspace and phase while keeping per-run idempotency and bounded cleanup, so stale `dreaming-narrative-*` sessions do not accumulate. Fixes #68252, #69187, and #70402. (#70464) Thanks @chiyouYCH.
|
||||
|
||||
@@ -53,6 +53,21 @@ const logShutdown = vi.hoisted(() => vi.fn().mockResolvedValue(undefined));
|
||||
const traceExporterCtor = vi.hoisted(() => vi.fn());
|
||||
const metricExporterCtor = vi.hoisted(() => vi.fn());
|
||||
const logExporterCtor = vi.hoisted(() => vi.fn());
|
||||
// Hoisted test double for the unhandled-rejection handler registry: records
// handlers passed to `register`, hands back an unregister callback, and lets
// tests inspect or clear the current handler list.
const unhandledRejectionHandlerState = vi.hoisted(() => {
  type RejectionHandler = (reason: unknown) => boolean;

  let registered: RejectionHandler[] = [];

  // Spy so tests can assert on how often registration happened.
  const register = vi.fn((handler: RejectionHandler) => {
    registered.push(handler);
    // Unregistering swaps in a filtered copy rather than mutating in place.
    return () => {
      registered = registered.filter((entry) => entry !== handler);
    };
  });

  return {
    getHandlers: () => registered,
    register,
    reset: () => {
      registered = [];
    },
  };
});
|
||||
|
||||
vi.mock("@opentelemetry/api", () => ({
|
||||
context: {
|
||||
@@ -99,6 +114,10 @@ vi.mock("@opentelemetry/exporter-logs-otlp-proto", () => ({
|
||||
},
|
||||
}));
|
||||
|
||||
// Route the plugin SDK's handler registration through the hoisted spy so the
// tests can observe registrations made by the service under test.
vi.mock("openclaw/plugin-sdk/runtime-env", () => {
  return {
    registerUnhandledRejectionHandler: unhandledRejectionHandlerState.register,
  };
});
|
||||
|
||||
vi.mock("@opentelemetry/sdk-logs", () => ({
|
||||
BatchLogRecordProcessor: function BatchLogRecordProcessor() {},
|
||||
LoggerProvider: class {
|
||||
@@ -336,6 +355,8 @@ describe("diagnostics-otel service", () => {
|
||||
traceExporterCtor.mockClear();
|
||||
metricExporterCtor.mockClear();
|
||||
logExporterCtor.mockClear();
|
||||
unhandledRejectionHandlerState.reset();
|
||||
unhandledRejectionHandlerState.register.mockClear();
|
||||
delete process.env.OTEL_EXPORTER_OTLP_TRACES_ENDPOINT;
|
||||
delete process.env.OTEL_EXPORTER_OTLP_METRICS_ENDPOINT;
|
||||
delete process.env.OTEL_EXPORTER_OTLP_LOGS_ENDPOINT;
|
||||
@@ -531,6 +552,54 @@ describe("diagnostics-otel service", () => {
|
||||
expect(telemetryState.tracer.startSpan).not.toHaveBeenCalled();
|
||||
});
|
||||
|
||||
// Lifecycle check: start registers exactly one handler, the handler claims
// OTLPExporterError-shaped reasons (direct, in an array, or inside an
// AggregateError) and ignores other errors, and stop unregisters it.
test("registers and removes an OTLP exporter unhandled rejection handler", async () => {
  // Fresh object per call so every case gets its own payload instance.
  const makeExporterPayload = () => ({
    name: "OTLPExporterError",
    code: 410,
    data: "user_stop",
  });

  const service = createDiagnosticsOtelService();
  const ctx = createOtelContext(OTEL_TEST_ENDPOINT, { traces: true, metrics: true, logs: true });

  await service.start(ctx);

  expect(unhandledRejectionHandlerState.register).toHaveBeenCalledTimes(1);
  const [handler] = unhandledRejectionHandlerState.getHandlers();
  expect(handler).toBeTypeOf("function");

  const errorInstance = Object.assign(new Error("collector gone"), {
    name: "OTLPExporterError",
    code: 410,
  });
  const suppressedReasons: unknown[] = [
    errorInstance,
    makeExporterPayload(),
    [makeExporterPayload()],
    new AggregateError([makeExporterPayload()], "export failed"),
  ];
  for (const reason of suppressedReasons) {
    expect(handler?.(reason)).toBe(true);
  }

  expect(handler?.(new Error("other exporter error"))).toBe(false);
  expect(ctx.logger.warn).toHaveBeenCalledWith(
    "diagnostics-otel: suppressed OTLP exporter unhandled rejection (code=410)",
  );

  await service.stop?.(ctx);
  expect(unhandledRejectionHandlerState.getHandlers()).toHaveLength(0);
});
|
||||
|
||||
// When the trace exporter constructor throws during start, the service must
// reject with that error and leave no rejection handler registered.
test("does not retain an OTLP exporter handler when startup setup fails", async () => {
  const startupError = new Error("trace exporter setup failed");
  const failOnce = (): never => {
    throw startupError;
  };
  traceExporterCtor.mockImplementationOnce(failOnce);

  const service = createDiagnosticsOtelService();
  const ctx = createOtelContext(OTEL_TEST_ENDPOINT, { traces: true });

  await expect(service.start(ctx)).rejects.toBe(startupError);

  const { register, getHandlers } = unhandledRejectionHandlerState;
  expect(register).not.toHaveBeenCalled();
  expect(getHandlers()).toHaveLength(0);
});
|
||||
|
||||
test("uses a preloaded OpenTelemetry SDK without dropping diagnostic listeners", async () => {
|
||||
process.env.OPENCLAW_OTEL_PRELOADED = "1";
|
||||
const service = createDiagnosticsOtelService();
|
||||
|
||||
@@ -15,6 +15,7 @@ import { PeriodicExportingMetricReader } from "@opentelemetry/sdk-metrics";
|
||||
import { NodeSDK } from "@opentelemetry/sdk-node";
|
||||
import { ParentBasedSampler, TraceIdRatioBasedSampler } from "@opentelemetry/sdk-trace-base";
|
||||
import { ATTR_SERVICE_NAME } from "@opentelemetry/semantic-conventions";
|
||||
import { registerUnhandledRejectionHandler } from "openclaw/plugin-sdk/runtime-env";
|
||||
import type {
|
||||
DiagnosticEventMetadata,
|
||||
DiagnosticEventPayload,
|
||||
@@ -169,6 +170,78 @@ function errorCategory(err: unknown): string {
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Flattens a rejection reason into every distinct value reachable from it.
 *
 * The walk is breadth-first. Arrays contribute their items; other objects
 * contribute their `cause`, `reason`, `original` and `error` properties and,
 * when `errors` is an array, each of its items. `null`/`undefined` values are
 * skipped, and an identity set ensures each value is visited once, so cyclic
 * references terminate.
 *
 * @param err - The value to inspect; may be anything, including `null`.
 * @returns The visited values in breadth-first order, starting with `err`
 *   itself when it is non-nullish. Primitives encountered along the way are
 *   included as candidates but are not traversed further.
 */
function collectNestedErrorCandidates(err: unknown): unknown[] {
  const seen = new Set<unknown>();
  const candidates: unknown[] = [];
  // Append-only work list read through a cursor: avoids Array#shift, which
  // re-indexes the array on every dequeue and makes large `errors` lists
  // quadratic.
  const pending: unknown[] = [err];

  const enqueue = (value: unknown): void => {
    if (value != null && !seen.has(value)) {
      pending.push(value);
    }
  };

  for (let cursor = 0; cursor < pending.length; cursor += 1) {
    const current = pending[cursor];
    // A value can be enqueued more than once before it is first visited.
    if (current == null || seen.has(current)) {
      continue;
    }
    seen.add(current);
    candidates.push(current);

    if (Array.isArray(current)) {
      for (const item of current) {
        enqueue(item);
      }
      continue;
    }
    if (typeof current !== "object") {
      continue;
    }

    const record = current as Record<string, unknown>;
    for (const nested of [record.cause, record.reason, record.original, record.error]) {
      enqueue(nested);
    }
    if (Array.isArray(record.errors)) {
      for (const nested of record.errors) {
        enqueue(nested);
      }
    }
  }

  return candidates;
}
|
||||
|
||||
/**
 * Reads the `name` property of an error-like value.
 *
 * @param err - Any value; only non-null objects are inspected.
 * @returns The `name` exactly as stored when it is a string containing
 *   non-whitespace characters; otherwise `undefined`.
 */
function readErrorName(err: unknown): string | undefined {
  if (typeof err !== "object" || err === null) {
    return undefined;
  }
  const { name } = err as { name?: unknown };
  if (typeof name !== "string") {
    return undefined;
  }
  // Blank names count as absent, but a usable name is returned untrimmed.
  return name.trim().length > 0 ? name : undefined;
}
|
||||
|
||||
/**
 * Reads the `code` property of an error-like value.
 *
 * @param err - Any value; only non-null objects are inspected.
 * @returns The `code` when it is a string or a number (including `""` and
 *   `0`); otherwise `undefined`.
 */
function readErrorCode(err: unknown): string | number | undefined {
  if (typeof err !== "object" || err === null) {
    return undefined;
  }
  const { code } = err as { code?: unknown };
  switch (typeof code) {
    case "string":
    case "number":
      return code;
    default:
      return undefined;
  }
}
|
||||
|
||||
/**
 * Searches a rejection reason, and everything `collectNestedErrorCandidates`
 * reaches from it, for an object whose `name` is `"OTLPExporterError"`.
 *
 * @param reason - The rejection reason to inspect.
 * @returns The first matching object in candidate order, or `undefined` when
 *   none matches.
 */
function findOtlpExporterError(reason: unknown): object | undefined {
  const isOtlpExporterError = (candidate: unknown): candidate is object =>
    typeof candidate === "object" &&
    candidate !== null &&
    readErrorName(candidate) === "OTLPExporterError";

  return collectNestedErrorCandidates(reason).find(isOtlpExporterError);
}
|
||||
|
||||
function redactOtelAttributes(attributes: Record<string, string | number | boolean>) {
|
||||
const redactedAttributes: Record<string, string | number | boolean> = {};
|
||||
for (const [key, value] of Object.entries(attributes)) {
|
||||
@@ -524,18 +597,22 @@ export function createDiagnosticsOtelService(): OpenClawPluginService {
|
||||
let logProvider: LoggerProvider | null = null;
|
||||
let unsubscribe: (() => void) | null = null;
|
||||
let stopActiveTrustedSpans: (() => void) | null = null;
|
||||
let unregisterUnhandledRejectionHandler: (() => void) | null = null;
|
||||
|
||||
const stopStarted = async () => {
|
||||
const currentUnsubscribe = unsubscribe;
|
||||
const currentLogProvider = logProvider;
|
||||
const currentSdk = sdk;
|
||||
const currentStopActiveTrustedSpans = stopActiveTrustedSpans;
|
||||
const currentUnregisterUnhandledRejectionHandler = unregisterUnhandledRejectionHandler;
|
||||
|
||||
unsubscribe = null;
|
||||
logProvider = null;
|
||||
sdk = null;
|
||||
stopActiveTrustedSpans = null;
|
||||
unregisterUnhandledRejectionHandler = null;
|
||||
|
||||
currentUnregisterUnhandledRejectionHandler?.();
|
||||
currentUnsubscribe?.();
|
||||
currentStopActiveTrustedSpans?.();
|
||||
if (currentLogProvider) {
|
||||
@@ -2471,6 +2548,18 @@ export function createDiagnosticsOtelService(): OpenClawPluginService {
|
||||
}
|
||||
});
|
||||
|
||||
unregisterUnhandledRejectionHandler = registerUnhandledRejectionHandler((reason) => {
|
||||
const otlpError = findOtlpExporterError(reason);
|
||||
if (!otlpError) {
|
||||
return false;
|
||||
}
|
||||
const code = readErrorCode(otlpError) ?? "unknown";
|
||||
ctx.logger.warn(
|
||||
`diagnostics-otel: suppressed OTLP exporter unhandled rejection (code=${String(code)})`,
|
||||
);
|
||||
return true;
|
||||
});
|
||||
|
||||
emitForSignals(enabledSignals, {
|
||||
exporter: "diagnostics-otel",
|
||||
status: "started",
|
||||
|
||||
Reference in New Issue
Block a user