Files
openclaw/extensions/diagnostics-otel/src/service.ts
Vincent Koc c4b9f54b46 fix(diagnostics): flush OTel trace batches
Apply diagnostics.otel.flushIntervalMs to OpenTelemetry trace batching so short-lived Windows and QA runs do not lose late lifecycle/model spans. Also make the OTel QA smoke wait for required telemetry and print bounded failure diagnostics.
2026-05-26 22:46:39 +01:00

3247 lines
113 KiB
TypeScript

import {
context as otelContextApi,
metrics,
trace,
SpanKind,
SpanStatusCode,
TraceFlags,
} from "@opentelemetry/api";
import type { LogRecord, SeverityNumber } from "@opentelemetry/api-logs";
import { OTLPLogExporter } from "@opentelemetry/exporter-logs-otlp-proto";
import { OTLPMetricExporter } from "@opentelemetry/exporter-metrics-otlp-proto";
import { OTLPTraceExporter } from "@opentelemetry/exporter-trace-otlp-proto";
import { resourceFromAttributes } from "@opentelemetry/resources";
import { BatchLogRecordProcessor, LoggerProvider } from "@opentelemetry/sdk-logs";
import { PeriodicExportingMetricReader } from "@opentelemetry/sdk-metrics";
import { NodeSDK } from "@opentelemetry/sdk-node";
import {
BatchSpanProcessor,
ParentBasedSampler,
TraceIdRatioBasedSampler,
} from "@opentelemetry/sdk-trace-base";
import { ATTR_SERVICE_NAME } from "@opentelemetry/semantic-conventions";
import {
ATTR_GEN_AI_INPUT_MESSAGES,
ATTR_GEN_AI_OUTPUT_MESSAGES,
ATTR_GEN_AI_SYSTEM_INSTRUCTIONS,
ATTR_GEN_AI_TOOL_DEFINITIONS,
} from "@opentelemetry/semantic-conventions/incubating";
import { registerUnhandledRejectionHandler } from "openclaw/plugin-sdk/runtime-env";
import type {
DiagnosticEventMetadata,
DiagnosticEventPayload,
DiagnosticTraceContext,
OpenClawPluginService,
} from "../api.js";
import {
isValidDiagnosticSpanId,
isValidDiagnosticTraceFlags,
isValidDiagnosticTraceId,
redactSensitiveText,
} from "../api.js";
const DEFAULT_SERVICE_NAME = "openclaw";
const DROPPED_OTEL_ATTRIBUTE_KEYS = new Set([
"openclaw.callId",
"openclaw.call_id",
"openclaw.chatId",
"openclaw.chat_id",
"openclaw.messageId",
"openclaw.message_id",
"openclaw.parentSpanId",
"openclaw.parent_span_id",
"openclaw.runId",
"openclaw.run_id",
"openclaw.sessionId",
"openclaw.session_id",
"openclaw.sessionKey",
"openclaw.session_key",
"openclaw.spanId",
"openclaw.span_id",
"openclaw.toolCallId",
"openclaw.tool_call_id",
"openclaw.traceId",
"openclaw.trace_id",
]);
const LOW_CARDINALITY_VALUE_RE = /^[A-Za-z0-9_.:-]{1,120}$/u;
const MAX_OTEL_CONTENT_ATTRIBUTE_CHARS = 128 * 1024;
const MAX_OTEL_CONTENT_ARRAY_ITEMS = 200;
const MAX_OTEL_LOG_BODY_CHARS = 4 * 1024;
const MAX_OTEL_LOG_ATTRIBUTE_COUNT = 64;
const MAX_OTEL_LOG_ATTRIBUTE_VALUE_CHARS = 4 * 1024;
const LOG_RECORD_EXPORT_FAILURE_REPORT_INTERVAL_MS = 60_000;
const OTEL_LOG_RAW_ATTRIBUTE_KEY_RE = /^[A-Za-z0-9_.:-]{1,64}$/u;
const OTEL_LOG_ATTRIBUTE_KEY_RE = /^[A-Za-z0-9_.:-]{1,96}$/u;
const BLOCKED_OTEL_LOG_ATTRIBUTE_KEYS = new Set(["__proto__", "prototype", "constructor"]);
const PRELOADED_OTEL_SDK_ENV = "OPENCLAW_OTEL_PRELOADED";
const OTEL_EXPORTER_OTLP_ENDPOINT_ENV = "OTEL_EXPORTER_OTLP_ENDPOINT";
const OTEL_EXPORTER_OTLP_TRACES_ENDPOINT_ENV = "OTEL_EXPORTER_OTLP_TRACES_ENDPOINT";
const OTEL_EXPORTER_OTLP_METRICS_ENDPOINT_ENV = "OTEL_EXPORTER_OTLP_METRICS_ENDPOINT";
const OTEL_EXPORTER_OTLP_LOGS_ENDPOINT_ENV = "OTEL_EXPORTER_OTLP_LOGS_ENDPOINT";
const OTEL_SEMCONV_STABILITY_OPT_IN_ENV = "OTEL_SEMCONV_STABILITY_OPT_IN";
const GEN_AI_LATEST_EXPERIMENTAL_OPT_IN = "gen_ai_latest_experimental";
const GEN_AI_TOKEN_USAGE_BUCKETS = [
1, 4, 16, 64, 256, 1024, 4096, 16384, 65536, 262144, 1048576, 4194304, 16777216, 67108864,
];
const GEN_AI_OPERATION_DURATION_BUCKETS = [
0.01, 0.02, 0.04, 0.08, 0.16, 0.32, 0.64, 1.28, 2.56, 5.12, 10.24, 20.48, 40.96, 81.92,
];
type OtelContentCapturePolicy = {
inputMessages: boolean;
outputMessages: boolean;
toolInputs: boolean;
toolOutputs: boolean;
systemPrompt: boolean;
toolDefinitions: boolean;
logBodies: boolean;
};
type OtelModelCallContent = {
inputMessages?: unknown;
outputMessages?: unknown;
systemPrompt?: string;
toolDefinitions?: unknown;
};
type MessageDeliveryDiagnosticEvent = Extract<
DiagnosticEventPayload,
{
type: "message.delivery.started" | "message.delivery.completed" | "message.delivery.error";
}
>;
type ModelCallLifecycleDiagnosticEvent = Extract<
DiagnosticEventPayload,
{ type: "model.call.completed" | "model.call.error" }
>;
type ModelFailoverDiagnosticEvent = Extract<DiagnosticEventPayload, { type: "model.failover" }>;
type HarnessRunDiagnosticEvent = Extract<
DiagnosticEventPayload,
{ type: "harness.run.started" | "harness.run.completed" | "harness.run.error" }
>;
type TelemetryExporterDiagnosticEvent = Extract<
DiagnosticEventPayload,
{ type: "telemetry.exporter" }
>;
type SessionRecoveryDiagnosticEvent = Extract<
DiagnosticEventPayload,
{ type: "session.recovery.requested" | "session.recovery.completed" }
>;
type TalkDiagnosticEvent = Extract<DiagnosticEventPayload, { type: "talk.event" }>;
const NO_CONTENT_CAPTURE: OtelContentCapturePolicy = {
inputMessages: false,
outputMessages: false,
toolInputs: false,
toolOutputs: false,
systemPrompt: false,
toolDefinitions: false,
logBodies: false,
};
function normalizeEndpoint(endpoint?: string): string | undefined {
const trimmed = endpoint?.trim();
return trimmed ? trimmed.replace(/\/+$/, "") : undefined;
}
function resolveOtelUrl(endpoint: string | undefined, path: string): string | undefined {
if (!endpoint) {
return undefined;
}
const endpointWithoutQueryOrFragment = endpoint.split(/[?#]/, 1)[0] ?? endpoint;
if (/\/v1\/(?:traces|metrics|logs)$/i.test(endpointWithoutQueryOrFragment)) {
return endpoint;
}
if (/[?#]/u.test(endpoint)) {
try {
const url = new URL(endpoint);
const basePath = url.pathname.replace(/\/+$/u, "");
url.pathname = `${basePath}/${path}`;
return url.toString();
} catch {
// Fall back to the historical concatenation path for non-URL test doubles.
}
}
return `${endpoint}/${path}`;
}
function resolveSignalOtelUrl(params: {
signalEndpoint?: string;
signalEnvEndpoint?: string;
endpoint?: string;
path: string;
}): string | undefined {
return resolveOtelUrl(
normalizeEndpoint(params.signalEndpoint ?? params.signalEnvEndpoint) ?? params.endpoint,
params.path,
);
}
function resolveSampleRate(value: number | undefined): number | undefined {
if (typeof value !== "number" || !Number.isFinite(value)) {
return undefined;
}
if (value < 0 || value > 1) {
return undefined;
}
return value;
}
function formatError(err: unknown): string {
if (err instanceof Error) {
return err.stack ?? err.message;
}
if (typeof err === "string") {
return err;
}
try {
return JSON.stringify(err);
} catch {
return String(err);
}
}
function errorCategory(err: unknown): string {
try {
if (err instanceof Error && typeof err.name === "string" && err.name.trim()) {
return lowCardinalityAttr(err.name, "Error");
}
return lowCardinalityAttr(typeof err, "unknown");
} catch {
return "unknown";
}
}
function collectNestedErrorCandidates(err: unknown): unknown[] {
const queue: unknown[] = [err];
const seen = new Set<unknown>();
const candidates: unknown[] = [];
while (queue.length > 0) {
const current = queue.shift();
if (current == null || seen.has(current)) {
continue;
}
seen.add(current);
candidates.push(current);
if (Array.isArray(current)) {
for (const item of current) {
if (item != null && !seen.has(item)) {
queue.push(item);
}
}
continue;
}
if (typeof current !== "object") {
continue;
}
const record = current as Record<string, unknown>;
for (const nested of [record.cause, record.reason, record.original, record.error]) {
if (nested != null && !seen.has(nested)) {
queue.push(nested);
}
}
if (Array.isArray(record.errors)) {
for (const nested of record.errors) {
if (nested != null && !seen.has(nested)) {
queue.push(nested);
}
}
}
}
return candidates;
}
function readErrorName(err: unknown): string | undefined {
if (!err || typeof err !== "object") {
return undefined;
}
const name = (err as { name?: unknown }).name;
return typeof name === "string" && name.trim() ? name : undefined;
}
function readErrorCode(err: unknown): string | number | undefined {
if (!err || typeof err !== "object") {
return undefined;
}
const code = (err as { code?: unknown }).code;
return typeof code === "string" || typeof code === "number" ? code : undefined;
}
function findOtlpExporterError(reason: unknown): object | undefined {
for (const candidate of collectNestedErrorCandidates(reason)) {
if (
readErrorName(candidate) === "OTLPExporterError" &&
candidate &&
typeof candidate === "object"
) {
return candidate;
}
}
return undefined;
}
function redactOtelAttributes(attributes: Record<string, string | number | boolean>) {
const redactedAttributes: Record<string, string | number | boolean> = {};
for (const [key, value] of Object.entries(attributes)) {
if (DROPPED_OTEL_ATTRIBUTE_KEYS.has(key)) {
continue;
}
redactedAttributes[key] = typeof value === "string" ? redactSensitiveText(value) : value;
}
return redactedAttributes;
}
function lowCardinalityAttr(value: string | undefined, fallback = "unknown"): string {
if (!value) {
return fallback;
}
const redacted = redactSensitiveText(value.trim());
const redactedLower = redacted.toLowerCase();
if (redactedLower.startsWith("agent:") || redactedLower.includes(":agent:")) {
return fallback;
}
return LOW_CARDINALITY_VALUE_RE.test(redacted) ? redacted : fallback;
}
function lowCardinalityQueueLaneAttr(value: string | undefined, fallback = "unknown"): string {
if (!value) {
return fallback;
}
const redacted = redactSensitiveText(value.trim());
const redactedLower = redacted.toLowerCase();
if (redactedLower.startsWith("agent:")) {
return fallback;
}
const scopedLaneIndex = redacted.indexOf(":");
const lane = scopedLaneIndex >= 0 ? redacted.slice(0, scopedLaneIndex) : redacted;
return LOW_CARDINALITY_VALUE_RE.test(lane) ? lane : fallback;
}
function shouldCaptureOtelLogBody(policy: OtelContentCapturePolicy): boolean {
return policy.logBodies;
}
function hasOtelSemconvOptIn(value: string | undefined, optIn: string): boolean {
return (
value
?.split(",")
.map((part) => part.trim())
.includes(optIn) ?? false
);
}
function emitLatestGenAiSemconv(): boolean {
return hasOtelSemconvOptIn(
process.env[OTEL_SEMCONV_STABILITY_OPT_IN_ENV],
GEN_AI_LATEST_EXPERIMENTAL_OPT_IN,
);
}
function genAiOperationName(
api: string | undefined,
): "chat" | "generate_content" | "text_completion" {
const normalized = api?.trim().toLowerCase();
if (!normalized) {
return "chat";
}
if (normalized === "completions" || normalized.endsWith("-completions")) {
return "text_completion";
}
if (normalized === "generate_content" || normalized.includes("generative-ai")) {
return "generate_content";
}
return "chat";
}
function positiveFiniteNumber(value: number | undefined): number | undefined {
return typeof value === "number" && Number.isFinite(value) && value > 0 ? value : undefined;
}
function assignPositiveNumberAttr(
attrs: Record<string, string | number | boolean>,
key: string,
value: number | undefined,
): void {
const normalized = positiveFiniteNumber(value);
if (normalized !== undefined) {
attrs[key] = normalized;
}
}
function assignModelCallSizeTimingAttrs(
attrs: Record<string, string | number | boolean>,
evt: {
requestPayloadBytes?: number;
responseStreamBytes?: number;
timeToFirstByteMs?: number;
},
): void {
assignPositiveNumberAttr(attrs, "openclaw.model_call.request_bytes", evt.requestPayloadBytes);
assignPositiveNumberAttr(attrs, "openclaw.model_call.response_bytes", evt.responseStreamBytes);
assignPositiveNumberAttr(
attrs,
"openclaw.model_call.time_to_first_byte_ms",
evt.timeToFirstByteMs,
);
}
function assignGenAiSpanIdentityAttrs(
attrs: Record<string, string | number | boolean>,
input: { api?: string; model?: string; provider?: string },
): void {
if (emitLatestGenAiSemconv()) {
attrs["gen_ai.provider.name"] = lowCardinalityAttr(input.provider);
} else {
attrs["gen_ai.system"] = lowCardinalityAttr(input.provider);
}
if (input.model) {
attrs["gen_ai.request.model"] = lowCardinalityAttr(input.model);
}
attrs["gen_ai.operation.name"] = genAiOperationName(input.api);
}
function assignGenAiModelCallAttrs(
attrs: Record<string, string | number | boolean>,
evt: { api?: string; model?: string; provider?: string },
): void {
assignGenAiSpanIdentityAttrs(attrs, evt);
}
function modelCallSpanName(evt: { api?: string; model?: string }): string {
if (!emitLatestGenAiSemconv()) {
return "openclaw.model.call";
}
return `${genAiOperationName(evt.api)} ${lowCardinalityAttr(evt.model)}`;
}
function modelCallSpanKind(): SpanKind | undefined {
return emitLatestGenAiSemconv() ? SpanKind.CLIENT : undefined;
}
function addUpstreamRequestIdSpanEvent(
span: { addEvent?: (name: string, attributes?: Record<string, string>) => void },
upstreamRequestIdHash: string | undefined,
): void {
if (!upstreamRequestIdHash) {
return;
}
const boundedHash = lowCardinalityAttr(upstreamRequestIdHash);
if (boundedHash === "unknown") {
return;
}
span.addEvent?.("openclaw.provider.request", {
"openclaw.upstreamRequestIdHash": boundedHash,
});
}
function clampOtelLogText(value: string, maxChars: number): string {
return value.length > maxChars ? `${value.slice(0, maxChars)}...(truncated)` : value;
}
function normalizeOtelLogString(value: string, maxChars: number): string {
return clampOtelLogText(redactSensitiveText(value), maxChars);
}
function resolveContentCapturePolicy(value: unknown): OtelContentCapturePolicy {
if (value === true) {
return {
inputMessages: true,
outputMessages: true,
toolInputs: true,
toolOutputs: true,
systemPrompt: false,
toolDefinitions: true,
logBodies: true,
};
}
if (!value || typeof value !== "object" || Array.isArray(value)) {
return NO_CONTENT_CAPTURE;
}
const config = value as Record<string, unknown>;
if (config.enabled !== true) {
return NO_CONTENT_CAPTURE;
}
return {
inputMessages: config.inputMessages === true,
outputMessages: config.outputMessages === true,
toolInputs: config.toolInputs === true,
toolOutputs: config.toolOutputs === true,
systemPrompt: config.systemPrompt === true,
toolDefinitions: config.toolDefinitions === true,
logBodies: false,
};
}
function hasPreloadedOtelSdk(): boolean {
return process.env[PRELOADED_OTEL_SDK_ENV] === "1";
}
function normalizeOtelContentValue(value: unknown): string | undefined {
if (typeof value === "string") {
return normalizeOtelLogString(value, MAX_OTEL_CONTENT_ATTRIBUTE_CHARS);
}
if (Array.isArray(value)) {
const items: string[] = [];
for (const item of value.slice(0, MAX_OTEL_CONTENT_ARRAY_ITEMS)) {
if (typeof item === "string") {
items.push(item);
}
}
if (items.length > 0) {
return normalizeOtelLogString(items.join("\n"), MAX_OTEL_CONTENT_ATTRIBUTE_CHARS);
}
}
const json = safeJsonString(value, MAX_OTEL_CONTENT_ATTRIBUTE_CHARS);
if (json) {
return json;
}
return undefined;
}
const TRUNCATED_JSON_TEXT_SUFFIX = "...(truncated)";
const JSON_TRUNCATION_STRING_BUDGETS = [8192, 4096, 2048, 1024, 512, 256, 128, 64, 32] as const;
const JSON_TRUNCATION_ARRAY_ITEM_BUDGETS = [
MAX_OTEL_CONTENT_ARRAY_ITEMS,
100,
50,
25,
10,
5,
1,
] as const;
const JSON_TRUNCATION_MAX_OBJECT_FIELDS = 64;
const JSON_TRUNCATION_MAX_DEPTH = 8;
type JsonTruncationOptions = {
maxArrayItems: number;
maxDepth: number;
maxObjectFields: number;
maxStringChars: number;
seen: WeakSet<object>;
};
function safeJsonString(value: unknown, maxChars: number): string | undefined {
if (value === undefined || typeof value === "function" || typeof value === "symbol") {
return undefined;
}
const exact = stringifyJsonForOtelAttribute(value);
if (exact && exact.length <= maxChars) {
return exact;
}
for (const maxArrayItems of JSON_TRUNCATION_ARRAY_ITEM_BUDGETS) {
for (const maxStringChars of JSON_TRUNCATION_STRING_BUDGETS) {
const candidate = truncateJsonValueForOtelAttribute(value, {
maxArrayItems,
maxDepth: JSON_TRUNCATION_MAX_DEPTH,
maxObjectFields: JSON_TRUNCATION_MAX_OBJECT_FIELDS,
maxStringChars,
seen: new WeakSet<object>(),
});
const json = stringifyJsonForOtelAttribute(candidate);
if (json && json.length <= maxChars) {
return json;
}
}
}
const summary = stringifyJsonForOtelAttribute({
truncated: true,
reason: exact ? "max_attribute_size" : "unserializable_value",
type: describeJsonValue(value),
});
return summary && summary.length <= maxChars ? summary : undefined;
}
function stringifyJsonForOtelAttribute(value: unknown): string | undefined {
try {
const json = JSON.stringify(value);
if (!json) {
return undefined;
}
return redactSensitiveText(json);
} catch {
return undefined;
}
}
function truncateJsonValueForOtelAttribute(
value: unknown,
options: JsonTruncationOptions,
): unknown {
if (typeof value === "string") {
return truncateJsonTextForOtelAttribute(value, options.maxStringChars);
}
if (typeof value === "number" || typeof value === "boolean" || value === null) {
return value;
}
if (typeof value === "bigint") {
return truncateJsonTextForOtelAttribute(String(value), options.maxStringChars);
}
if (value === undefined || typeof value === "function" || typeof value === "symbol") {
return undefined;
}
if (options.maxDepth <= 0) {
return { truncated: true, reason: "max_depth" };
}
if (Array.isArray(value)) {
return truncateJsonArrayForOtelAttribute(value, options);
}
if (typeof value === "object") {
return truncateJsonObjectForOtelAttribute(value as Record<string, unknown>, options);
}
return undefined;
}
function truncateJsonArrayForOtelAttribute(
value: readonly unknown[],
options: JsonTruncationOptions,
): unknown[] {
if (options.seen.has(value)) {
return [{ truncated: true, reason: "circular_reference" }];
}
options.seen.add(value);
const nextOptions = { ...options, maxDepth: options.maxDepth - 1 };
const items = value
.slice(0, options.maxArrayItems)
.map((item) => truncateJsonValueForOtelAttribute(item, nextOptions));
if (value.length > items.length) {
items.push({ truncated: true, omittedItems: value.length - items.length });
}
options.seen.delete(value);
return items;
}
function truncateJsonObjectForOtelAttribute(
value: Record<string, unknown>,
options: JsonTruncationOptions,
): Record<string, unknown> {
if (options.seen.has(value)) {
return { truncated: true, reason: "circular_reference" };
}
options.seen.add(value);
const nextOptions = { ...options, maxDepth: options.maxDepth - 1 };
const result: Record<string, unknown> = {};
const entries = Object.entries(value).filter(
([, field]) => field !== undefined && typeof field !== "function" && typeof field !== "symbol",
);
for (const [key, field] of entries.slice(0, options.maxObjectFields)) {
result[key] = truncateJsonValueForOtelAttribute(field, nextOptions);
}
if (entries.length > options.maxObjectFields) {
result.truncated = true;
result.omittedFields = entries.length - options.maxObjectFields;
}
options.seen.delete(value);
return result;
}
function truncateJsonTextForOtelAttribute(value: string, maxChars: number): string {
const redacted = redactSensitiveText(value);
if (redacted.length <= maxChars) {
return redacted;
}
const suffixBudget = Math.min(TRUNCATED_JSON_TEXT_SUFFIX.length, maxChars);
const prefixBudget = Math.max(0, maxChars - suffixBudget);
return `${redacted.slice(0, prefixBudget)}${TRUNCATED_JSON_TEXT_SUFFIX.slice(
TRUNCATED_JSON_TEXT_SUFFIX.length - suffixBudget,
)}`;
}
function describeJsonValue(value: unknown): string {
if (Array.isArray(value)) {
return "array";
}
if (value === null) {
return "null";
}
return typeof value;
}
function isRecord(value: unknown): value is Record<string, unknown> {
return Boolean(value) && typeof value === "object" && !Array.isArray(value);
}
function textPart(content: string): Record<string, unknown> {
return { type: "text", content };
}
function toolCallResponsePart(part: Record<string, unknown>): Record<string, unknown> {
return {
type: "tool_call_response",
...(typeof part.id === "string" ? { id: part.id } : {}),
result: part.result ?? part.response ?? part.content ?? part.details ?? "",
};
}
function contentParts(value: unknown): Record<string, unknown>[] {
if (typeof value === "string") {
return value.length > 0 ? [textPart(value)] : [];
}
if (!Array.isArray(value)) {
if (value === undefined || value === null) {
return [];
}
if (typeof value === "number" || typeof value === "boolean" || typeof value === "bigint") {
return [textPart(String(value))];
}
const json = safeJsonString(value, MAX_OTEL_CONTENT_ATTRIBUTE_CHARS);
return json ? [textPart(json)] : [];
}
const parts: Record<string, unknown>[] = [];
for (const part of value) {
if (typeof part === "string") {
if (part.length > 0) {
parts.push(textPart(part));
}
continue;
}
if (!isRecord(part)) {
continue;
}
if (part.type === "text" && typeof part.text === "string") {
parts.push(textPart(part.text));
} else if (part.type === "text" && typeof part.content === "string") {
parts.push(textPart(part.content));
} else if (part.type === "thinking" && typeof part.thinking === "string") {
parts.push({ type: "reasoning", content: part.thinking });
} else if (part.type === "toolCall" && typeof part.name === "string") {
parts.push({
type: "tool_call",
name: part.name,
...(typeof part.id === "string" ? { id: part.id } : {}),
...(part.arguments !== undefined ? { arguments: part.arguments } : {}),
});
} else if (part.type === "tool_call" && typeof part.name === "string") {
parts.push({
type: "tool_call",
name: part.name,
...(typeof part.id === "string" ? { id: part.id } : {}),
...(part.arguments !== undefined ? { arguments: part.arguments } : {}),
});
} else if (part.type === "tool_call_response") {
parts.push(toolCallResponsePart(part));
} else if (part.type === "image") {
const data = typeof part.data === "string" ? part.data : undefined;
parts.push({
type: "blob",
modality: "image",
...(typeof part.mimeType === "string" ? { mime_type: part.mimeType } : {}),
...(typeof part.mime_type === "string" ? { mime_type: part.mime_type } : {}),
...(data ? { content: data } : {}),
});
}
}
return parts;
}
function normalizeGenAiMessage(
value: unknown,
fallbackRole = "user",
): Record<string, unknown> | undefined {
if (typeof value === "string") {
return { role: fallbackRole, parts: [textPart(value)] };
}
if (!isRecord(value)) {
return undefined;
}
const rawRole = typeof value.role === "string" ? value.role : fallbackRole;
const role = rawRole === "toolResult" ? "tool" : rawRole;
let parts: Record<string, unknown>[];
if (role === "tool") {
const explicitParts = contentParts(value.parts);
parts =
explicitParts.length > 0
? explicitParts
: [
toolCallResponsePart({
id: value.toolCallId,
result: value.content ?? value.details ?? "",
}),
];
} else {
parts = contentParts(value.parts ?? value.content);
}
if (parts.length === 0) {
return undefined;
}
return {
role,
parts,
...(typeof value.name === "string" ? { name: value.name } : {}),
...(typeof value.finish_reason === "string" ? { finish_reason: value.finish_reason } : {}),
...(typeof value.stopReason === "string" ? { finish_reason: value.stopReason } : {}),
};
}
function normalizeGenAiMessages(value: unknown, fallbackRole: "user" | "assistant") {
const source = Array.isArray(value) ? value : value === undefined ? [] : [value];
const messages: Record<string, unknown>[] = [];
for (const item of source.slice(0, MAX_OTEL_CONTENT_ARRAY_ITEMS)) {
const message = normalizeGenAiMessage(item, fallbackRole);
if (message) {
messages.push(message);
}
}
return messages;
}
function normalizeGenAiToolDefinition(value: unknown): Record<string, unknown> | undefined {
if (!isRecord(value) || typeof value.name !== "string" || value.name.trim().length === 0) {
return undefined;
}
return {
type: typeof value.type === "string" ? value.type : "function",
name: value.name,
...(typeof value.description === "string" ? { description: value.description } : {}),
...(value.parameters !== undefined ? { parameters: value.parameters } : {}),
};
}
function normalizeGenAiToolDefinitions(value: unknown) {
if (!Array.isArray(value)) {
return [];
}
const definitions: Record<string, unknown>[] = [];
for (const item of value.slice(0, MAX_OTEL_CONTENT_ARRAY_ITEMS)) {
const definition = normalizeGenAiToolDefinition(item);
if (definition) {
definitions.push(definition);
}
}
return definitions;
}
function assignJsonAttribute(
attributes: Record<string, string | number | boolean>,
key: string,
value: unknown,
): void {
const json = safeJsonString(value, MAX_OTEL_CONTENT_ATTRIBUTE_CHARS);
if (json) {
attributes[key] = json;
}
}
function assignGenAiModelContentAttributes(
attributes: Record<string, string | number | boolean>,
content: OtelModelCallContent | undefined,
policy: OtelContentCapturePolicy,
): void {
if (policy.systemPrompt && typeof content?.systemPrompt === "string") {
const systemInstructions = [textPart(content.systemPrompt)];
assignJsonAttribute(attributes, ATTR_GEN_AI_SYSTEM_INSTRUCTIONS, systemInstructions);
}
if (policy.inputMessages) {
const inputMessages = normalizeGenAiMessages(content?.inputMessages, "user");
if (inputMessages.length > 0) {
assignJsonAttribute(attributes, ATTR_GEN_AI_INPUT_MESSAGES, inputMessages);
assignJsonAttribute(attributes, "input.value", inputMessages);
attributes["input.mime_type"] = "application/json";
}
}
if (policy.toolDefinitions) {
const toolDefinitions = normalizeGenAiToolDefinitions(content?.toolDefinitions);
if (toolDefinitions.length > 0) {
assignJsonAttribute(attributes, ATTR_GEN_AI_TOOL_DEFINITIONS, toolDefinitions);
}
}
if (policy.outputMessages) {
const outputMessages = normalizeGenAiMessages(content?.outputMessages, "assistant");
if (outputMessages.length > 0) {
assignJsonAttribute(attributes, ATTR_GEN_AI_OUTPUT_MESSAGES, outputMessages);
assignJsonAttribute(attributes, "output.value", outputMessages);
attributes["output.mime_type"] = "application/json";
}
}
}
function assignOtelContentAttribute(
attributes: Record<string, string | number | boolean>,
key: string,
value: unknown,
): void {
const normalized = normalizeOtelContentValue(value);
if (normalized) {
attributes[key] = normalized;
}
}
function assignOtelModelContentAttributes(
attributes: Record<string, string | number | boolean>,
content: OtelModelCallContent | undefined,
policy: OtelContentCapturePolicy,
): void {
assignGenAiModelContentAttributes(attributes, content, policy);
if (policy.inputMessages) {
assignOtelContentAttribute(
attributes,
"openclaw.content.input_messages",
content?.inputMessages,
);
}
if (policy.toolDefinitions) {
assignOtelContentAttribute(
attributes,
"openclaw.content.tool_definitions",
content?.toolDefinitions,
);
}
if (policy.outputMessages) {
assignOtelContentAttribute(
attributes,
"openclaw.content.output_messages",
content?.outputMessages,
);
}
if (policy.systemPrompt) {
assignOtelContentAttribute(
attributes,
"openclaw.content.system_prompt",
content?.systemPrompt,
);
}
}
function assignOtelToolContentAttributes(
attributes: Record<string, string | number | boolean>,
event: Record<string, unknown>,
policy: OtelContentCapturePolicy,
): void {
if (policy.toolInputs) {
assignOtelContentAttribute(attributes, "openclaw.content.tool_input", event.toolInput);
}
if (policy.toolOutputs) {
assignOtelContentAttribute(attributes, "openclaw.content.tool_output", event.toolOutput);
}
}
function assignOtelLogAttribute(
attributes: Record<string, string | number | boolean>,
key: string,
value: string | number | boolean,
): void {
if (Object.keys(attributes).length >= MAX_OTEL_LOG_ATTRIBUTE_COUNT) {
return;
}
if (BLOCKED_OTEL_LOG_ATTRIBUTE_KEYS.has(key)) {
return;
}
if (redactSensitiveText(key) !== key) {
return;
}
if (!OTEL_LOG_ATTRIBUTE_KEY_RE.test(key)) {
return;
}
if (typeof value === "string") {
attributes[key] = normalizeOtelLogString(value, MAX_OTEL_LOG_ATTRIBUTE_VALUE_CHARS);
return;
}
if (typeof value === "number" && Number.isFinite(value)) {
attributes[key] = value;
return;
}
if (typeof value === "boolean") {
attributes[key] = value;
}
}
function normalizeTraceContext(value: unknown): DiagnosticTraceContext | undefined {
if (!value || typeof value !== "object" || Array.isArray(value)) {
return undefined;
}
const candidate = value as Partial<DiagnosticTraceContext>;
if (!isValidDiagnosticTraceId(candidate.traceId)) {
return undefined;
}
if (candidate.spanId !== undefined && !isValidDiagnosticSpanId(candidate.spanId)) {
return undefined;
}
if (candidate.parentSpanId !== undefined && !isValidDiagnosticSpanId(candidate.parentSpanId)) {
return undefined;
}
if (candidate.traceFlags !== undefined && !isValidDiagnosticTraceFlags(candidate.traceFlags)) {
return undefined;
}
return {
traceId: candidate.traceId,
...(candidate.spanId ? { spanId: candidate.spanId } : {}),
...(candidate.parentSpanId ? { parentSpanId: candidate.parentSpanId } : {}),
...(candidate.traceFlags ? { traceFlags: candidate.traceFlags } : {}),
};
}
function assignOtelLogEventAttributes(
attributes: Record<string, string | number | boolean>,
eventAttributes: Record<string, string | number | boolean> | undefined,
): void {
if (!eventAttributes) {
return;
}
for (const rawKey in eventAttributes) {
if (Object.keys(attributes).length >= MAX_OTEL_LOG_ATTRIBUTE_COUNT) {
break;
}
if (!Object.hasOwn(eventAttributes, rawKey)) {
continue;
}
const key = rawKey.trim();
if (BLOCKED_OTEL_LOG_ATTRIBUTE_KEYS.has(key)) {
continue;
}
if (redactSensitiveText(key) !== key) {
continue;
}
if (!OTEL_LOG_RAW_ATTRIBUTE_KEY_RE.test(key)) {
continue;
}
assignOtelLogAttribute(attributes, `openclaw.${key}`, eventAttributes[rawKey]);
}
}
function traceFlagsToOtel(traceFlags: string | undefined): TraceFlags {
const parsed = Number.parseInt(traceFlags ?? "00", 16);
return (parsed & TraceFlags.SAMPLED) !== 0 ? TraceFlags.SAMPLED : TraceFlags.NONE;
}
function contextForTraceContext(traceContext: DiagnosticTraceContext | undefined) {
const normalized = normalizeTraceContext(traceContext);
if (!normalized?.spanId) {
return undefined;
}
return trace.setSpanContext(otelContextApi.active(), {
traceId: normalized.traceId,
spanId: normalized.spanId,
traceFlags: traceFlagsToOtel(normalized.traceFlags),
isRemote: true,
});
}
function contextForTrustedTraceContext(
evt: DiagnosticEventPayload,
metadata: DiagnosticEventMetadata,
) {
return metadata.trusted ? contextForTraceContext(evt.trace) : undefined;
}
function addTraceAttributes(
attributes: Record<string, string | number | boolean>,
traceContext: DiagnosticTraceContext | undefined,
): void {
const normalized = normalizeTraceContext(traceContext);
if (!normalized) {
return;
}
attributes["openclaw.traceId"] = normalized.traceId;
if (normalized.spanId) {
attributes["openclaw.spanId"] = normalized.spanId;
}
if (normalized.parentSpanId) {
attributes["openclaw.parentSpanId"] = normalized.parentSpanId;
}
if (normalized.traceFlags) {
attributes["openclaw.traceFlags"] = normalized.traceFlags;
}
}
export function createDiagnosticsOtelService(): OpenClawPluginService {
let sdk: NodeSDK | null = null;
let logProvider: LoggerProvider | null = null;
let unsubscribe: (() => void) | null = null;
let stopActiveTrustedSpans: (() => void) | null = null;
let unregisterUnhandledRejectionHandler: (() => void) | null = null;
const stopStarted = async () => {
const currentUnsubscribe = unsubscribe;
const currentLogProvider = logProvider;
const currentSdk = sdk;
const currentStopActiveTrustedSpans = stopActiveTrustedSpans;
const currentUnregisterUnhandledRejectionHandler = unregisterUnhandledRejectionHandler;
unsubscribe = null;
logProvider = null;
sdk = null;
stopActiveTrustedSpans = null;
unregisterUnhandledRejectionHandler = null;
currentUnregisterUnhandledRejectionHandler?.();
currentUnsubscribe?.();
currentStopActiveTrustedSpans?.();
if (currentLogProvider) {
await currentLogProvider.shutdown().catch(() => undefined);
}
if (currentSdk) {
await currentSdk.shutdown().catch(() => undefined);
}
};
return {
id: "diagnostics-otel",
async start(ctx) {
await stopStarted();
const cfg = ctx.config.diagnostics;
const otel = cfg?.otel;
if (!cfg || cfg.enabled === false || !otel?.enabled) {
return;
}
const emitExporterEvent = (
event: Omit<TelemetryExporterDiagnosticEvent, "type" | "seq" | "ts">,
) => {
try {
ctx.internalDiagnostics?.emit({
type: "telemetry.exporter",
...event,
});
} catch {
// Exporter health must never affect the exporter lifecycle.
}
};
const emitForSignals = (
signals: TelemetryExporterDiagnosticEvent["signal"][],
event: Omit<TelemetryExporterDiagnosticEvent, "type" | "seq" | "ts" | "signal">,
) => {
for (const signal of signals) {
emitExporterEvent({ signal, ...event });
}
};
const tracesEnabled = otel.traces !== false;
const metricsEnabled = otel.metrics !== false;
const logsEnabled = otel.logs === true;
const enabledSignals: TelemetryExporterDiagnosticEvent["signal"][] = [
...(tracesEnabled ? (["traces"] as const) : []),
...(metricsEnabled ? (["metrics"] as const) : []),
...(logsEnabled ? (["logs"] as const) : []),
];
if (enabledSignals.length === 0) {
return;
}
const protocol = otel.protocol ?? process.env.OTEL_EXPORTER_OTLP_PROTOCOL ?? "http/protobuf";
if (protocol !== "http/protobuf") {
emitForSignals(enabledSignals, {
exporter: "diagnostics-otel",
status: "failure",
reason: "unsupported_protocol",
});
ctx.logger.warn(`diagnostics-otel: unsupported protocol ${protocol}`);
return;
}
const endpoint = normalizeEndpoint(
otel.endpoint ?? process.env[OTEL_EXPORTER_OTLP_ENDPOINT_ENV],
);
const headers = otel.headers ?? undefined;
const serviceName =
otel.serviceName?.trim() || process.env.OTEL_SERVICE_NAME || DEFAULT_SERVICE_NAME;
const sampleRate = resolveSampleRate(otel.sampleRate);
const contentCapturePolicy = resolveContentCapturePolicy(otel.captureContent);
const sdkPreloaded = hasPreloadedOtelSdk();
const resource = resourceFromAttributes({
[ATTR_SERVICE_NAME]: serviceName,
});
const logUrl = resolveSignalOtelUrl({
signalEndpoint: otel.logsEndpoint,
signalEnvEndpoint: process.env[OTEL_EXPORTER_OTLP_LOGS_ENDPOINT_ENV],
endpoint,
path: "v1/logs",
});
if (!sdkPreloaded && (tracesEnabled || metricsEnabled)) {
const traceUrl = resolveSignalOtelUrl({
signalEndpoint: otel.tracesEndpoint,
signalEnvEndpoint: process.env[OTEL_EXPORTER_OTLP_TRACES_ENDPOINT_ENV],
endpoint,
path: "v1/traces",
});
const metricUrl = resolveSignalOtelUrl({
signalEndpoint: otel.metricsEndpoint,
signalEnvEndpoint: process.env[OTEL_EXPORTER_OTLP_METRICS_ENDPOINT_ENV],
endpoint,
path: "v1/metrics",
});
const traceExporter = tracesEnabled
? new OTLPTraceExporter({
...(traceUrl ? { url: traceUrl } : {}),
...(headers ? { headers } : {}),
})
: undefined;
const spanProcessors =
traceExporter && typeof otel.flushIntervalMs === "number"
? [
new BatchSpanProcessor(traceExporter, {
scheduledDelayMillis: Math.max(1000, otel.flushIntervalMs),
}),
]
: undefined;
const metricExporter = metricsEnabled
? new OTLPMetricExporter({
...(metricUrl ? { url: metricUrl } : {}),
...(headers ? { headers } : {}),
})
: undefined;
const metricReader = metricExporter
? new PeriodicExportingMetricReader({
exporter: metricExporter,
...(typeof otel.flushIntervalMs === "number"
? { exportIntervalMillis: Math.max(1000, otel.flushIntervalMs) }
: {}),
})
: undefined;
sdk = new NodeSDK({
resource,
...(spanProcessors ? { spanProcessors } : traceExporter ? { traceExporter } : {}),
...(metricReader ? { metricReader } : {}),
...(sampleRate !== undefined
? {
sampler: new ParentBasedSampler({
root: new TraceIdRatioBasedSampler(sampleRate),
}),
}
: {}),
});
try {
sdk.start();
} catch (err) {
emitForSignals(
[
...(tracesEnabled ? (["traces"] as const) : []),
...(metricsEnabled ? (["metrics"] as const) : []),
],
{
exporter: "diagnostics-otel",
status: "failure",
reason: "start_failed",
errorCategory: errorCategory(err),
},
);
await stopStarted();
ctx.logger.error(`diagnostics-otel: failed to start SDK: ${formatError(err)}`);
throw err;
}
} else if (sdkPreloaded && (tracesEnabled || metricsEnabled)) {
ctx.logger.info("diagnostics-otel: using preloaded OpenTelemetry SDK");
}
const logSeverityMap: Record<string, SeverityNumber> = {
TRACE: 1 as SeverityNumber,
DEBUG: 5 as SeverityNumber,
INFO: 9 as SeverityNumber,
WARN: 13 as SeverityNumber,
ERROR: 17 as SeverityNumber,
FATAL: 21 as SeverityNumber,
};
const meter = metrics.getMeter("openclaw");
const tracer = trace.getTracer("openclaw");
const activeTrustedSpans = new Map<string, ReturnType<typeof tracer.startSpan>>();
const activeTrustedSpanAliases = new Map<string, ReturnType<typeof tracer.startSpan>>();
const pendingTrustedRunFinalizers = new Map<string, ReturnType<typeof setImmediate>>();
stopActiveTrustedSpans = () => {
const stopAt = Date.now();
for (const handle of pendingTrustedRunFinalizers.values()) {
clearImmediate(handle);
}
pendingTrustedRunFinalizers.clear();
for (const span of new Set([
...activeTrustedSpans.values(),
...activeTrustedSpanAliases.values(),
])) {
span.end(stopAt);
}
activeTrustedSpans.clear();
activeTrustedSpanAliases.clear();
};
const tokensCounter = meter.createCounter("openclaw.tokens", {
unit: "1",
description: "Token usage by type",
});
const genAiTokenUsageHistogram = meter.createHistogram("gen_ai.client.token.usage", {
unit: "{token}",
description: "Number of input and output tokens used by GenAI client operations",
advice: {
explicitBucketBoundaries: GEN_AI_TOKEN_USAGE_BUCKETS,
},
});
const genAiOperationDurationHistogram = meter.createHistogram(
"gen_ai.client.operation.duration",
{
unit: "s",
description: "GenAI client operation duration",
advice: {
explicitBucketBoundaries: GEN_AI_OPERATION_DURATION_BUCKETS,
},
},
);
const costCounter = meter.createCounter("openclaw.cost.usd", {
unit: "1",
description: "Estimated model cost (USD)",
});
const durationHistogram = meter.createHistogram("openclaw.run.duration_ms", {
unit: "ms",
description: "Agent run duration",
});
const harnessDurationHistogram = meter.createHistogram("openclaw.harness.duration_ms", {
unit: "ms",
description: "Agent harness lifecycle duration",
});
const contextHistogram = meter.createHistogram("openclaw.context.tokens", {
unit: "1",
description: "Context window size and usage",
});
const webhookReceivedCounter = meter.createCounter("openclaw.webhook.received", {
unit: "1",
description: "Webhook requests received",
});
const webhookErrorCounter = meter.createCounter("openclaw.webhook.error", {
unit: "1",
description: "Webhook processing errors",
});
const webhookDurationHistogram = meter.createHistogram("openclaw.webhook.duration_ms", {
unit: "ms",
description: "Webhook processing duration",
});
const messageQueuedCounter = meter.createCounter("openclaw.message.queued", {
unit: "1",
description: "Messages queued for processing",
});
const messageReceivedCounter = meter.createCounter("openclaw.message.received", {
unit: "1",
description: "Inbound messages received",
});
const messageDispatchStartedCounter = meter.createCounter(
"openclaw.message.dispatch.started",
{
unit: "1",
description: "Inbound message dispatch attempts started",
},
);
const messageDispatchCompletedCounter = meter.createCounter(
"openclaw.message.dispatch.completed",
{
unit: "1",
description: "Inbound message dispatch attempts completed",
},
);
const messageDispatchDurationHistogram = meter.createHistogram(
"openclaw.message.dispatch.duration_ms",
{
unit: "ms",
description: "Inbound message dispatch duration",
},
);
const messageProcessedCounter = meter.createCounter("openclaw.message.processed", {
unit: "1",
description: "Messages processed by outcome",
});
const messageDurationHistogram = meter.createHistogram("openclaw.message.duration_ms", {
unit: "ms",
description: "Message processing duration",
});
const messageDeliveryStartedCounter = meter.createCounter(
"openclaw.message.delivery.started",
{
unit: "1",
description: "Outbound message delivery attempts started",
},
);
const messageDeliveryDurationHistogram = meter.createHistogram(
"openclaw.message.delivery.duration_ms",
{
unit: "ms",
description: "Outbound message delivery duration",
},
);
const queueDepthHistogram = meter.createHistogram("openclaw.queue.depth", {
unit: "1",
description: "Queue depth on enqueue/dequeue",
});
const queueWaitHistogram = meter.createHistogram("openclaw.queue.wait_ms", {
unit: "ms",
description: "Queue wait time before execution",
});
const laneEnqueueCounter = meter.createCounter("openclaw.queue.lane.enqueue", {
unit: "1",
description: "Command queue lane enqueue events",
});
const laneDequeueCounter = meter.createCounter("openclaw.queue.lane.dequeue", {
unit: "1",
description: "Command queue lane dequeue events",
});
const sessionStateCounter = meter.createCounter("openclaw.session.state", {
unit: "1",
description: "Session state transitions",
});
const sessionTurnCreatedCounter = meter.createCounter("openclaw.session.turn.created", {
unit: "1",
description: "Agent session turns created",
});
const sessionStuckCounter = meter.createCounter("openclaw.session.stuck", {
unit: "1",
description: "Sessions stuck in processing",
});
const sessionStuckAgeHistogram = meter.createHistogram("openclaw.session.stuck_age_ms", {
unit: "ms",
description: "Age of stuck sessions",
});
const sessionRecoveryRequestedCounter = meter.createCounter(
"openclaw.session.recovery.requested",
{
unit: "1",
description: "Session recovery attempts requested",
},
);
const sessionRecoveryCompletedCounter = meter.createCounter(
"openclaw.session.recovery.completed",
{
unit: "1",
description: "Session recovery attempts completed",
},
);
const sessionRecoveryAgeHistogram = meter.createHistogram(
"openclaw.session.recovery.age_ms",
{
unit: "ms",
description: "Age of sessions selected for recovery",
},
);
const talkEventCounter = meter.createCounter("openclaw.talk.event", {
unit: "1",
description: "Talk events emitted by type",
});
const talkEventDurationHistogram = meter.createHistogram("openclaw.talk.event.duration_ms", {
unit: "ms",
description: "Talk event duration when reported",
});
const talkAudioBytesHistogram = meter.createHistogram("openclaw.talk.audio.bytes", {
unit: "By",
description: "Talk audio frame byte lengths",
});
const runAttemptCounter = meter.createCounter("openclaw.run.attempt", {
unit: "1",
description: "Run attempts",
});
const toolLoopCounter = meter.createCounter("openclaw.tool.loop", {
unit: "1",
description: "Detected repetitive tool-call loop events",
});
const skillUsedCounter = meter.createCounter("openclaw.skill.used", {
unit: "1",
description: "Skills used by agent runs",
});
const modelCallDurationHistogram = meter.createHistogram("openclaw.model_call.duration_ms", {
unit: "ms",
description: "Model call duration",
});
const modelCallRequestBytesHistogram = meter.createHistogram(
"openclaw.model_call.request_bytes",
{
unit: "By",
description: "UTF-8 byte size of sanitized model request payloads",
},
);
const modelCallResponseBytesHistogram = meter.createHistogram(
"openclaw.model_call.response_bytes",
{
unit: "By",
description: "UTF-8 byte size of streamed model response events",
},
);
const modelCallTimeToFirstByteHistogram = meter.createHistogram(
"openclaw.model_call.time_to_first_byte_ms",
{
unit: "ms",
description: "Elapsed time before the first streamed model response event",
},
);
const modelFailoverCounter = meter.createCounter("openclaw.model.failover", {
unit: "1",
description: "Model failovers by source, destination, lane, and reason",
});
const toolExecutionDurationHistogram = meter.createHistogram(
"openclaw.tool.execution.duration_ms",
{
unit: "ms",
description: "Tool execution duration",
},
);
const toolExecutionBlockedCounter = meter.createCounter(
"openclaw.tool.execution.blocked",
{
unit: "1",
description: "Tool executions blocked by policy or sandbox diagnostics",
},
);
const execProcessDurationHistogram = meter.createHistogram("openclaw.exec.duration_ms", {
unit: "ms",
description: "Exec process duration",
});
const memoryRssHistogram = meter.createHistogram("openclaw.memory.rss_bytes", {
unit: "By",
description: "Resident set size reported by diagnostic memory samples",
});
const memoryHeapUsedHistogram = meter.createHistogram("openclaw.memory.heap_used_bytes", {
unit: "By",
description: "Heap used bytes reported by diagnostic memory samples",
});
const memoryHeapTotalHistogram = meter.createHistogram("openclaw.memory.heap_total_bytes", {
unit: "By",
description: "Heap total bytes reported by diagnostic memory samples",
});
const memoryExternalHistogram = meter.createHistogram("openclaw.memory.external_bytes", {
unit: "By",
description: "External memory bytes reported by diagnostic memory samples",
});
const memoryArrayBuffersHistogram = meter.createHistogram(
"openclaw.memory.array_buffers_bytes",
{
unit: "By",
description: "ArrayBuffer bytes reported by diagnostic memory samples",
},
);
const memoryPressureCounter = meter.createCounter("openclaw.memory.pressure", {
unit: "1",
description: "Diagnostic memory pressure events",
});
const asyncQueueDroppedCounter = meter.createCounter(
"openclaw.diagnostic.async_queue.dropped",
{
unit: "1",
description: "Async diagnostic queue drops by dropped event class",
},
);
const payloadLargeCounter = meter.createCounter("openclaw.payload.large", {
unit: "1",
description: "Oversized payload diagnostics by surface and action",
});
const payloadLargeBytesHistogram = meter.createHistogram("openclaw.payload.large_bytes", {
unit: "By",
description: "Oversized payload byte sizes by surface and action",
});
const livenessWarningCounter = meter.createCounter("openclaw.liveness.warning", {
unit: "1",
description: "Diagnostic liveness warning events",
});
const livenessEventLoopDelayP99Histogram = meter.createHistogram(
"openclaw.liveness.event_loop_delay_p99_ms",
{
unit: "ms",
description: "P99 event-loop delay reported by diagnostic liveness warnings",
},
);
const livenessEventLoopDelayMaxHistogram = meter.createHistogram(
"openclaw.liveness.event_loop_delay_max_ms",
{
unit: "ms",
description: "Maximum event-loop delay reported by diagnostic liveness warnings",
},
);
const livenessEventLoopUtilizationHistogram = meter.createHistogram(
"openclaw.liveness.event_loop_utilization",
{
unit: "1",
description: "Event-loop utilization reported by diagnostic liveness warnings",
},
);
const livenessCpuCoreRatioHistogram = meter.createHistogram(
"openclaw.liveness.cpu_core_ratio",
{
unit: "1",
description: "CPU core ratio reported by diagnostic liveness warnings",
},
);
const telemetryExporterCounter = meter.createCounter("openclaw.telemetry.exporter.events", {
unit: "1",
description: "Diagnostic telemetry exporter lifecycle and failure events",
});
let recordLogRecord:
| ((
evt: Extract<DiagnosticEventPayload, { type: "log.record" }>,
metadata: DiagnosticEventMetadata,
) => void)
| undefined;
if (logsEnabled) {
let logRecordExportFailureLastReportedAt = Number.NEGATIVE_INFINITY;
const logExporter = new OTLPLogExporter({
...(logUrl ? { url: logUrl } : {}),
...(headers ? { headers } : {}),
});
const logProcessor = new BatchLogRecordProcessor(
logExporter,
typeof otel.flushIntervalMs === "number"
? { scheduledDelayMillis: Math.max(1000, otel.flushIntervalMs) }
: {},
);
logProvider = new LoggerProvider({
resource,
processors: [logProcessor],
});
const otelLogger = logProvider.getLogger("openclaw");
recordLogRecord = (evt, metadata) => {
try {
const logLevelName = evt.level || "INFO";
const severityNumber = logSeverityMap[logLevelName] ?? (9 as SeverityNumber);
const body = shouldCaptureOtelLogBody(contentCapturePolicy)
? normalizeOtelLogString(evt.message || "log", MAX_OTEL_LOG_BODY_CHARS)
: "log";
const attributes = Object.create(null) as Record<string, string | number | boolean>;
assignOtelLogAttribute(attributes, "openclaw.log.level", logLevelName);
if (evt.loggerName) {
assignOtelLogAttribute(attributes, "openclaw.logger", evt.loggerName);
}
if (evt.loggerParents?.length) {
assignOtelLogAttribute(
attributes,
"openclaw.logger.parents",
evt.loggerParents.join("."),
);
}
assignOtelLogEventAttributes(attributes, evt.attributes);
if (evt.code?.line) {
assignOtelLogAttribute(attributes, "code.lineno", evt.code.line);
}
if (evt.code?.functionName) {
assignOtelLogAttribute(attributes, "code.function", evt.code.functionName);
}
if (metadata.trusted) {
addTraceAttributes(attributes, evt.trace);
}
const logRecord: LogRecord = {
body,
severityText: logLevelName,
severityNumber,
attributes: redactOtelAttributes(attributes),
timestamp: evt.ts,
};
const logContext = contextForTrustedTraceContext(evt, metadata);
if (logContext) {
logRecord.context = logContext;
}
otelLogger.emit(logRecord);
} catch (err) {
emitExporterEvent({
exporter: "diagnostics-otel",
signal: "logs",
status: "failure",
reason: "emit_failed",
errorCategory: errorCategory(err),
});
const now = Date.now();
if (
now - logRecordExportFailureLastReportedAt >=
LOG_RECORD_EXPORT_FAILURE_REPORT_INTERVAL_MS
) {
logRecordExportFailureLastReportedAt = now;
ctx.logger.error(`diagnostics-otel: log record export failed: ${formatError(err)}`);
}
}
};
}
const spanWithDuration = (
name: string,
attributes: Record<string, string | number | boolean>,
durationMs?: number,
options: {
parentContext?: ReturnType<typeof contextForTraceContext> | null;
endTimeMs?: number;
kind?: SpanKind;
startTimeMs?: number;
} = {},
) => {
const endTimeMs = options.endTimeMs ?? Date.now();
const startTime =
typeof options.startTimeMs === "number"
? options.startTimeMs
: typeof durationMs === "number" && durationMs >= 0
? endTimeMs - durationMs
: undefined;
const parentContext =
"parentContext" in options ? (options.parentContext ?? undefined) : undefined;
const span = tracer.startSpan(
name,
{
attributes: redactOtelAttributes(attributes),
...(options.kind !== undefined ? { kind: options.kind } : {}),
...(startTime !== undefined ? { startTime } : {}),
},
parentContext,
);
return span;
};
const trustedTraceContext = (
evt: DiagnosticEventPayload,
metadata: DiagnosticEventMetadata,
) => (metadata.trusted ? normalizeTraceContext(evt.trace) : undefined);
const activeTrustedParentContext = (
evt: DiagnosticEventPayload,
metadata: DiagnosticEventMetadata,
) => {
const parentSpanId = trustedTraceContext(evt, metadata)?.parentSpanId;
if (!parentSpanId) {
return undefined;
}
const activeParentSpan =
activeTrustedSpans.get(parentSpanId) ?? activeTrustedSpanAliases.get(parentSpanId);
if (!activeParentSpan) {
return undefined;
}
return trace.setSpanContext(otelContextApi.active(), activeParentSpan.spanContext());
};
const trackTrustedSpan = (
evt: DiagnosticEventPayload,
metadata: DiagnosticEventMetadata,
span: ReturnType<typeof tracer.startSpan>,
) => {
const spanId = trustedTraceContext(evt, metadata)?.spanId;
if (spanId) {
activeTrustedSpans.set(spanId, span);
}
return span;
};
const takeTrackedTrustedSpan = (
evt: DiagnosticEventPayload,
metadata: DiagnosticEventMetadata,
) => {
const spanId = trustedTraceContext(evt, metadata)?.spanId;
if (!spanId) {
return undefined;
}
const span = activeTrustedSpans.get(spanId);
if (span) {
activeTrustedSpans.delete(spanId);
}
return span;
};
const setSpanAttrs = (
span: ReturnType<typeof tracer.startSpan>,
attributes: Record<string, string | number | boolean>,
) => {
span.setAttributes?.(redactOtelAttributes(attributes));
};
const scheduleTrackedRunSpanFinalize = (
spanId: string,
parentSpanId: string | undefined,
span: ReturnType<typeof tracer.startSpan>,
endTimeMs: number,
) => {
const existingHandle = pendingTrustedRunFinalizers.get(spanId);
if (existingHandle) {
clearImmediate(existingHandle);
}
const handle = setImmediate(() => {
pendingTrustedRunFinalizers.delete(spanId);
if (activeTrustedSpans.get(spanId) === span) {
activeTrustedSpans.delete(spanId);
}
if (parentSpanId && activeTrustedSpanAliases.get(parentSpanId) === span) {
activeTrustedSpanAliases.delete(parentSpanId);
}
span.end(endTimeMs);
});
pendingTrustedRunFinalizers.set(spanId, handle);
};
const addRunAttrs = (
spanAttrs: Record<string, string | number | boolean>,
evt: {
runId?: string;
sessionKey?: string;
sessionId?: string;
provider?: string;
model?: string;
channel?: string;
trigger?: string;
},
) => {
if (evt.provider) {
spanAttrs["openclaw.provider"] = evt.provider;
}
if (evt.model) {
spanAttrs["openclaw.model"] = evt.model;
}
if (evt.channel) {
spanAttrs["openclaw.channel"] = evt.channel;
}
if (evt.trigger) {
spanAttrs["openclaw.trigger"] = evt.trigger;
}
};
const paramsSummaryAttrs = (
summary: Extract<
DiagnosticEventPayload,
{ type: "tool.execution.started" }
>["paramsSummary"],
): Record<string, string | number> => {
if (!summary) {
return {};
}
return {
"openclaw.tool.params.kind": summary.kind,
...("length" in summary ? { "openclaw.tool.params.length": summary.length } : {}),
};
};
const recordModelUsage = (
evt: Extract<DiagnosticEventPayload, { type: "model.usage" }>,
metadata: DiagnosticEventMetadata,
) => {
const attrs = {
"openclaw.channel": evt.channel ?? "unknown",
"openclaw.agent": lowCardinalityAttr(evt.agentId),
"openclaw.provider": evt.provider ?? "unknown",
"openclaw.model": evt.model ?? "unknown",
};
const genAiAttrs: Record<string, string> = {
"gen_ai.operation.name": "chat",
"gen_ai.provider.name": lowCardinalityAttr(evt.provider),
"gen_ai.request.model": lowCardinalityAttr(evt.model),
};
const usage = evt.usage;
if (usage.input) {
tokensCounter.add(usage.input, { ...attrs, "openclaw.token": "input" });
genAiTokenUsageHistogram.record(usage.input, {
...genAiAttrs,
"gen_ai.token.type": "input",
});
}
if (usage.output) {
tokensCounter.add(usage.output, { ...attrs, "openclaw.token": "output" });
genAiTokenUsageHistogram.record(usage.output, {
...genAiAttrs,
"gen_ai.token.type": "output",
});
}
if (usage.cacheRead) {
tokensCounter.add(usage.cacheRead, { ...attrs, "openclaw.token": "cache_read" });
}
if (usage.cacheWrite) {
tokensCounter.add(usage.cacheWrite, { ...attrs, "openclaw.token": "cache_write" });
}
if (usage.promptTokens) {
tokensCounter.add(usage.promptTokens, { ...attrs, "openclaw.token": "prompt" });
}
if (usage.total) {
tokensCounter.add(usage.total, { ...attrs, "openclaw.token": "total" });
}
if (evt.costUsd) {
costCounter.add(evt.costUsd, attrs);
}
if (evt.durationMs) {
durationHistogram.record(evt.durationMs, attrs);
}
if (evt.context?.limit) {
contextHistogram.record(evt.context.limit, {
...attrs,
"openclaw.context": "limit",
});
}
if (evt.context?.used) {
contextHistogram.record(evt.context.used, {
...attrs,
"openclaw.context": "used",
});
}
if (!tracesEnabled) {
return;
}
const genAiInputTokens =
usage.promptTokens ??
(usage.input ?? 0) + (usage.cacheRead ?? 0) + (usage.cacheWrite ?? 0);
const spanAttrs: Record<string, string | number> = {
...attrs,
"openclaw.tokens.input": usage.input ?? 0,
"openclaw.tokens.output": usage.output ?? 0,
"openclaw.tokens.cache_read": usage.cacheRead ?? 0,
"openclaw.tokens.cache_write": usage.cacheWrite ?? 0,
"openclaw.tokens.total": usage.total ?? 0,
};
assignGenAiSpanIdentityAttrs(spanAttrs, evt);
assignPositiveNumberAttr(spanAttrs, "gen_ai.usage.input_tokens", genAiInputTokens);
assignPositiveNumberAttr(spanAttrs, "gen_ai.usage.output_tokens", usage.output);
assignPositiveNumberAttr(
spanAttrs,
"gen_ai.usage.cache_read.input_tokens",
usage.cacheRead,
);
assignPositiveNumberAttr(
spanAttrs,
"gen_ai.usage.cache_creation.input_tokens",
usage.cacheWrite,
);
const span = spanWithDuration("openclaw.model.usage", spanAttrs, evt.durationMs, {
parentContext: activeTrustedParentContext(evt, metadata),
endTimeMs: evt.ts,
});
span.end(evt.ts);
};
const recordWebhookReceived = (
evt: Extract<DiagnosticEventPayload, { type: "webhook.received" }>,
) => {
const attrs = {
"openclaw.channel": evt.channel ?? "unknown",
"openclaw.webhook": evt.updateType ?? "unknown",
};
webhookReceivedCounter.add(1, attrs);
};
const recordWebhookProcessed = (
evt: Extract<DiagnosticEventPayload, { type: "webhook.processed" }>,
) => {
const attrs = {
"openclaw.channel": lowCardinalityAttr(evt.channel),
"openclaw.webhook": lowCardinalityAttr(evt.updateType),
};
if (typeof evt.durationMs === "number") {
webhookDurationHistogram.record(evt.durationMs, attrs);
}
if (!tracesEnabled) {
return;
}
const spanAttrs: Record<string, string | number> = { ...attrs };
const span = spanWithDuration("openclaw.webhook.processed", spanAttrs, evt.durationMs);
span.end();
};
const recordWebhookError = (
evt: Extract<DiagnosticEventPayload, { type: "webhook.error" }>,
) => {
const attrs = {
"openclaw.channel": lowCardinalityAttr(evt.channel),
"openclaw.webhook": lowCardinalityAttr(evt.updateType),
};
webhookErrorCounter.add(1, attrs);
if (!tracesEnabled) {
return;
}
const redactedError = redactSensitiveText(evt.error);
const spanAttrs: Record<string, string | number> = {
...attrs,
"openclaw.error": redactedError,
};
const span = tracer.startSpan("openclaw.webhook.error", {
attributes: spanAttrs,
});
span.setStatus({ code: SpanStatusCode.ERROR, message: redactedError });
span.end();
};
const recordMessageQueued = (
evt: Extract<DiagnosticEventPayload, { type: "message.queued" }>,
) => {
const attrs = {
"openclaw.channel": lowCardinalityAttr(evt.channel),
"openclaw.source": lowCardinalityAttr(evt.source),
};
messageQueuedCounter.add(1, attrs);
if (typeof evt.queueDepth === "number") {
queueDepthHistogram.record(evt.queueDepth, attrs);
}
};
const recordMessageReceived = (
evt: Extract<DiagnosticEventPayload, { type: "message.received" }>,
) => {
messageReceivedCounter.add(1, {
"openclaw.channel": lowCardinalityAttr(evt.channel),
"openclaw.source": lowCardinalityAttr(evt.source),
});
};
const recordMessageDispatchStarted = (
evt: Extract<DiagnosticEventPayload, { type: "message.dispatch.started" }>,
) => {
messageDispatchStartedCounter.add(1, {
"openclaw.channel": lowCardinalityAttr(evt.channel),
"openclaw.source": lowCardinalityAttr(evt.source),
});
};
const recordMessageDispatchCompleted = (
evt: Extract<DiagnosticEventPayload, { type: "message.dispatch.completed" }>,
) => {
const attrs = {
"openclaw.channel": lowCardinalityAttr(evt.channel),
"openclaw.outcome": evt.outcome,
"openclaw.reason": lowCardinalityAttr(evt.reason, "none"),
"openclaw.source": lowCardinalityAttr(evt.source),
};
messageDispatchCompletedCounter.add(1, attrs);
messageDispatchDurationHistogram.record(evt.durationMs, attrs);
};
const recordMessageProcessed = (
evt: Extract<DiagnosticEventPayload, { type: "message.processed" }>,
) => {
const attrs = {
"openclaw.channel": lowCardinalityAttr(evt.channel),
"openclaw.outcome": evt.outcome ?? "unknown",
};
messageProcessedCounter.add(1, attrs);
if (typeof evt.durationMs === "number") {
messageDurationHistogram.record(evt.durationMs, attrs);
}
if (!tracesEnabled) {
return;
}
const spanAttrs: Record<string, string | number> = { ...attrs };
if (evt.reason) {
spanAttrs["openclaw.reason"] = lowCardinalityAttr(evt.reason, "unknown");
}
const span = spanWithDuration("openclaw.message.processed", spanAttrs, evt.durationMs);
if (evt.outcome === "error" && evt.error) {
span.setStatus({ code: SpanStatusCode.ERROR, message: redactSensitiveText(evt.error) });
}
span.end();
};
const messageDeliveryAttrs = (
evt: MessageDeliveryDiagnosticEvent,
): Record<string, string> => ({
"openclaw.channel": lowCardinalityAttr(evt.channel),
"openclaw.delivery.kind": lowCardinalityAttr(evt.deliveryKind, "other"),
});
const recordMessageDeliveryStarted = (
evt: Extract<DiagnosticEventPayload, { type: "message.delivery.started" }>,
) => {
messageDeliveryStartedCounter.add(1, messageDeliveryAttrs(evt));
};
const recordMessageDeliveryCompleted = (
evt: Extract<DiagnosticEventPayload, { type: "message.delivery.completed" }>,
) => {
const attrs = {
...messageDeliveryAttrs(evt),
"openclaw.outcome": "completed",
};
messageDeliveryDurationHistogram.record(evt.durationMs, attrs);
if (!tracesEnabled) {
return;
}
const span = spanWithDuration(
"openclaw.message.delivery",
{
...attrs,
"openclaw.delivery.result_count": evt.resultCount,
},
evt.durationMs,
{ endTimeMs: evt.ts },
);
span.end(evt.ts);
};
const recordMessageDeliveryError = (
evt: Extract<DiagnosticEventPayload, { type: "message.delivery.error" }>,
) => {
const attrs = {
...messageDeliveryAttrs(evt),
"openclaw.outcome": "error",
"openclaw.errorCategory": lowCardinalityAttr(evt.errorCategory, "other"),
};
messageDeliveryDurationHistogram.record(evt.durationMs, attrs);
if (!tracesEnabled) {
return;
}
const span = spanWithDuration("openclaw.message.delivery", attrs, evt.durationMs, {
endTimeMs: evt.ts,
});
span.setStatus({
code: SpanStatusCode.ERROR,
message: redactSensitiveText(evt.errorCategory),
});
span.end(evt.ts);
};
const recordRunStarted = (
evt: Extract<DiagnosticEventPayload, { type: "run.started" }>,
metadata: DiagnosticEventMetadata,
) => {
if (!tracesEnabled || !metadata.trusted) {
return;
}
const spanAttrs: Record<string, string | number | boolean> = {};
addRunAttrs(spanAttrs, evt);
const span = trackTrustedSpan(
evt,
metadata,
spanWithDuration("openclaw.run", spanAttrs, undefined, {
parentContext: activeTrustedParentContext(evt, metadata),
startTimeMs: evt.ts,
}),
);
const parentSpanId = trustedTraceContext(evt, metadata)?.parentSpanId;
if (parentSpanId && !activeTrustedSpans.has(parentSpanId)) {
activeTrustedSpanAliases.set(parentSpanId, span);
}
};
const recordLaneEnqueue = (
evt: Extract<DiagnosticEventPayload, { type: "queue.lane.enqueue" }>,
) => {
const attrs = { "openclaw.lane": lowCardinalityQueueLaneAttr(evt.lane) };
laneEnqueueCounter.add(1, attrs);
queueDepthHistogram.record(evt.queueSize, attrs);
};
const recordLaneDequeue = (
evt: Extract<DiagnosticEventPayload, { type: "queue.lane.dequeue" }>,
) => {
const attrs = { "openclaw.lane": lowCardinalityQueueLaneAttr(evt.lane) };
laneDequeueCounter.add(1, attrs);
queueDepthHistogram.record(evt.queueSize, attrs);
if (typeof evt.waitMs === "number") {
queueWaitHistogram.record(evt.waitMs, attrs);
}
};
const recordSessionState = (
evt: Extract<DiagnosticEventPayload, { type: "session.state" }>,
) => {
const attrs: Record<string, string> = { "openclaw.state": evt.state };
if (evt.reason) {
attrs["openclaw.reason"] = redactSensitiveText(evt.reason);
}
sessionStateCounter.add(1, attrs);
};
const recordSessionTurnCreated = (
evt: Extract<DiagnosticEventPayload, { type: "session.turn.created" }>,
) => {
sessionTurnCreatedCounter.add(1, {
"openclaw.agent": lowCardinalityAttr(evt.agentId, "unknown"),
"openclaw.channel": lowCardinalityAttr(evt.channel, "unknown"),
"openclaw.trigger": evt.trigger,
});
};
const recordSessionStuck = (
evt: Extract<DiagnosticEventPayload, { type: "session.stuck" }>,
) => {
const attrs: Record<string, string> = { "openclaw.state": evt.state };
sessionStuckCounter.add(1, attrs);
if (typeof evt.ageMs === "number") {
sessionStuckAgeHistogram.record(evt.ageMs, attrs);
}
if (!tracesEnabled) {
return;
}
const spanAttrs: Record<string, string | number> = { ...attrs };
spanAttrs["openclaw.queueDepth"] = evt.queueDepth ?? 0;
spanAttrs["openclaw.ageMs"] = evt.ageMs;
const span = tracer.startSpan("openclaw.session.stuck", { attributes: spanAttrs });
span.setStatus({ code: SpanStatusCode.ERROR, message: "session stuck" });
span.end();
};
const sessionRecoveryAttrs = (evt: SessionRecoveryDiagnosticEvent) => {
const attrs: Record<string, string> = { "openclaw.state": evt.state };
if (evt.reason) {
attrs["openclaw.reason"] = redactSensitiveText(evt.reason);
}
if (evt.activeWorkKind) {
attrs["openclaw.active_work_kind"] = evt.activeWorkKind;
}
return attrs;
};
const recordSessionRecoveryRequested = (
evt: Extract<DiagnosticEventPayload, { type: "session.recovery.requested" }>,
) => {
const attrs = sessionRecoveryAttrs(evt);
attrs["openclaw.action"] = evt.allowActiveAbort ? "abort" : "recover";
sessionRecoveryRequestedCounter.add(1, attrs);
sessionRecoveryAgeHistogram.record(evt.ageMs, attrs);
};
const recordSessionRecoveryCompleted = (
evt: Extract<DiagnosticEventPayload, { type: "session.recovery.completed" }>,
) => {
const attrs = sessionRecoveryAttrs(evt);
attrs["openclaw.status"] = evt.status;
attrs["openclaw.action"] = lowCardinalityAttr(evt.action, "unknown");
if (evt.outcomeReason) {
attrs["openclaw.reason"] = redactSensitiveText(evt.outcomeReason);
}
sessionRecoveryCompletedCounter.add(1, attrs);
sessionRecoveryAgeHistogram.record(evt.ageMs, attrs);
};
const talkEventAttrs = (evt: TalkDiagnosticEvent): Record<string, string> => ({
"openclaw.talk.brain": lowCardinalityAttr(evt.brain),
"openclaw.talk.event_type": lowCardinalityAttr(evt.talkEventType),
"openclaw.talk.mode": lowCardinalityAttr(evt.mode),
"openclaw.talk.provider": lowCardinalityAttr(evt.provider),
"openclaw.talk.transport": lowCardinalityAttr(evt.transport),
});
const recordTalkEvent = (evt: TalkDiagnosticEvent, metadata: DiagnosticEventMetadata) => {
if (!metadata.trusted) {
return;
}
const attrs = talkEventAttrs(evt);
talkEventCounter.add(1, attrs);
if (typeof evt.durationMs === "number") {
talkEventDurationHistogram.record(evt.durationMs, attrs);
}
if (typeof evt.byteLength === "number") {
talkAudioBytesHistogram.record(evt.byteLength, attrs);
}
};
const recordRunAttempt = (evt: Extract<DiagnosticEventPayload, { type: "run.attempt" }>) => {
runAttemptCounter.add(1, { "openclaw.attempt": evt.attempt });
};
const toolLoopAttrs = (
evt: Extract<DiagnosticEventPayload, { type: "tool.loop" }>,
): Record<string, string | number> => ({
"openclaw.toolName": lowCardinalityAttr(evt.toolName, "tool"),
"openclaw.loop.level": evt.level,
"openclaw.loop.action": evt.action,
"openclaw.loop.detector": evt.detector,
"openclaw.loop.count": evt.count,
...(evt.pairedToolName
? { "openclaw.loop.paired_tool": lowCardinalityAttr(evt.pairedToolName, "tool") }
: {}),
});
const recordToolLoop = (evt: Extract<DiagnosticEventPayload, { type: "tool.loop" }>) => {
const attrs = toolLoopAttrs(evt);
toolLoopCounter.add(1, attrs);
if (!tracesEnabled) {
return;
}
const span = spanWithDuration("openclaw.tool.loop", attrs, 0, { endTimeMs: evt.ts });
if (evt.level === "critical" || evt.action === "block") {
span.setStatus({
code: SpanStatusCode.ERROR,
message: `${evt.detector}:${evt.action}`,
});
}
span.end(evt.ts);
};
const recordMemoryUsageMetrics = (
evt: Extract<
DiagnosticEventPayload,
{ type: "diagnostic.memory.sample" | "diagnostic.memory.pressure" }
>,
attrs: Record<string, string> = {},
) => {
memoryRssHistogram.record(evt.memory.rssBytes, attrs);
memoryHeapUsedHistogram.record(evt.memory.heapUsedBytes, attrs);
memoryHeapTotalHistogram.record(evt.memory.heapTotalBytes, attrs);
memoryExternalHistogram.record(evt.memory.externalBytes, attrs);
memoryArrayBuffersHistogram.record(evt.memory.arrayBuffersBytes, attrs);
};
const recordMemorySample = (
evt: Extract<DiagnosticEventPayload, { type: "diagnostic.memory.sample" }>,
) => {
recordMemoryUsageMetrics(evt);
};
const recordMemoryPressure = (
evt: Extract<DiagnosticEventPayload, { type: "diagnostic.memory.pressure" }>,
) => {
const attrs = {
"openclaw.memory.level": evt.level,
"openclaw.memory.reason": evt.reason,
};
memoryPressureCounter.add(1, attrs);
recordMemoryUsageMetrics(evt, attrs);
if (!tracesEnabled) {
return;
}
const spanAttrs: Record<string, string | number | boolean> = {
...attrs,
"openclaw.memory.rss_bytes": evt.memory.rssBytes,
"openclaw.memory.heap_used_bytes": evt.memory.heapUsedBytes,
"openclaw.memory.heap_total_bytes": evt.memory.heapTotalBytes,
"openclaw.memory.external_bytes": evt.memory.externalBytes,
"openclaw.memory.array_buffers_bytes": evt.memory.arrayBuffersBytes,
...(evt.thresholdBytes !== undefined
? { "openclaw.memory.threshold_bytes": evt.thresholdBytes }
: {}),
...(evt.rssGrowthBytes !== undefined
? { "openclaw.memory.rss_growth_bytes": evt.rssGrowthBytes }
: {}),
...(evt.windowMs !== undefined ? { "openclaw.memory.window_ms": evt.windowMs } : {}),
};
const span = spanWithDuration("openclaw.memory.pressure", spanAttrs, 0, {
endTimeMs: evt.ts,
});
if (evt.level === "critical") {
span.setStatus({
code: SpanStatusCode.ERROR,
message: evt.reason,
});
}
span.end(evt.ts);
};
const recordAsyncQueueDropped = (
evt: Extract<DiagnosticEventPayload, { type: "diagnostic.async_queue.dropped" }>,
) => {
asyncQueueDroppedCounter.add(evt.droppedEvents, {
"openclaw.diagnostic.async_queue.drop_class": "total",
});
if (evt.droppedTrustedEvents !== undefined) {
asyncQueueDroppedCounter.add(evt.droppedTrustedEvents, {
"openclaw.diagnostic.async_queue.drop_class": "trusted",
});
}
if (evt.droppedUntrustedEvents !== undefined) {
asyncQueueDroppedCounter.add(evt.droppedUntrustedEvents, {
"openclaw.diagnostic.async_queue.drop_class": "untrusted",
});
}
if (evt.droppedPriorityEvents !== undefined) {
asyncQueueDroppedCounter.add(evt.droppedPriorityEvents, {
"openclaw.diagnostic.async_queue.drop_class": "priority",
});
}
};
const recordRunCompleted = (
evt: Extract<DiagnosticEventPayload, { type: "run.completed" }>,
metadata: DiagnosticEventMetadata,
) => {
const attrs: Record<string, string | number> = {
"openclaw.outcome": evt.outcome,
"openclaw.provider": evt.provider ?? "unknown",
"openclaw.model": evt.model ?? "unknown",
};
if (evt.channel) {
attrs["openclaw.channel"] = evt.channel;
}
if (evt.blockedBy) {
attrs["openclaw.blocked_by"] = lowCardinalityAttr(evt.blockedBy, "unknown");
}
durationHistogram.record(evt.durationMs, attrs);
if (!tracesEnabled) {
return;
}
const spanAttrs: Record<string, string | number | boolean> = {
"openclaw.outcome": evt.outcome,
};
addRunAttrs(spanAttrs, evt);
if (evt.blockedBy) {
spanAttrs["openclaw.blocked_by"] = lowCardinalityAttr(evt.blockedBy, "unknown");
}
if (evt.errorCategory) {
spanAttrs["openclaw.errorCategory"] = lowCardinalityAttr(evt.errorCategory, "other");
}
const trustedTrace = trustedTraceContext(evt, metadata);
const trackedSpan = trustedTrace?.spanId
? activeTrustedSpans.get(trustedTrace.spanId)
: undefined;
const span =
trackedSpan ??
spanWithDuration("openclaw.run", spanAttrs, evt.durationMs, {
parentContext: activeTrustedParentContext(evt, metadata),
endTimeMs: evt.ts,
});
setSpanAttrs(span, spanAttrs);
if (evt.outcome === "error") {
span.setStatus({
code: SpanStatusCode.ERROR,
...(evt.errorCategory ? { message: redactSensitiveText(evt.errorCategory) } : {}),
});
}
if (trackedSpan && trustedTrace?.spanId) {
scheduleTrackedRunSpanFinalize(
trustedTrace.spanId,
trustedTrace.parentSpanId,
trackedSpan,
evt.ts,
);
return;
}
span.end(evt.ts);
};
const harnessRunMetricAttrs = (evt: HarnessRunDiagnosticEvent) => ({
"openclaw.harness.id": lowCardinalityAttr(evt.harnessId, "unknown"),
"openclaw.harness.plugin": lowCardinalityAttr(evt.pluginId),
...(evt.type === "harness.run.started"
? {}
: {
"openclaw.outcome": evt.type === "harness.run.error" ? "error" : evt.outcome,
}),
"openclaw.provider": lowCardinalityAttr(evt.provider, "unknown"),
"openclaw.model": lowCardinalityAttr(evt.model, "unknown"),
...(evt.channel ? { "openclaw.channel": lowCardinalityAttr(evt.channel) } : {}),
});
const recordHarnessRunStarted = (
evt: Extract<DiagnosticEventPayload, { type: "harness.run.started" }>,
metadata: DiagnosticEventMetadata,
) => {
if (!tracesEnabled || !metadata.trusted) {
return;
}
trackTrustedSpan(
evt,
metadata,
spanWithDuration("openclaw.harness.run", harnessRunMetricAttrs(evt), undefined, {
parentContext: activeTrustedParentContext(evt, metadata),
startTimeMs: evt.ts,
}),
);
};
const recordHarnessRunCompleted = (
evt: Extract<DiagnosticEventPayload, { type: "harness.run.completed" }>,
metadata: DiagnosticEventMetadata,
) => {
harnessDurationHistogram.record(evt.durationMs, harnessRunMetricAttrs(evt));
if (!tracesEnabled) {
return;
}
const spanAttrs: Record<string, string | number | boolean> = {
...harnessRunMetricAttrs(evt),
};
if (evt.resultClassification) {
spanAttrs["openclaw.harness.result_classification"] = lowCardinalityAttr(
evt.resultClassification,
);
}
if (typeof evt.yieldDetected === "boolean") {
spanAttrs["openclaw.harness.yield_detected"] = evt.yieldDetected;
}
if (evt.itemLifecycle) {
spanAttrs["openclaw.harness.items.started"] = evt.itemLifecycle.startedCount;
spanAttrs["openclaw.harness.items.completed"] = evt.itemLifecycle.completedCount;
spanAttrs["openclaw.harness.items.active"] = evt.itemLifecycle.activeCount;
}
const span =
takeTrackedTrustedSpan(evt, metadata) ??
spanWithDuration("openclaw.harness.run", spanAttrs, evt.durationMs, {
parentContext: activeTrustedParentContext(evt, metadata),
endTimeMs: evt.ts,
});
setSpanAttrs(span, spanAttrs);
if (evt.outcome === "error") {
span.setStatus({
code: SpanStatusCode.ERROR,
message: "error",
});
}
span.end(evt.ts);
};
const recordHarnessRunError = (
evt: Extract<DiagnosticEventPayload, { type: "harness.run.error" }>,
metadata: DiagnosticEventMetadata,
) => {
const errorType = lowCardinalityAttr(evt.errorCategory, "other");
const attrs = {
...harnessRunMetricAttrs(evt),
"openclaw.harness.phase": evt.phase,
"openclaw.errorCategory": errorType,
};
harnessDurationHistogram.record(evt.durationMs, attrs);
if (!tracesEnabled) {
return;
}
const spanAttrs: Record<string, string | number | boolean> = {
...attrs,
"error.type": errorType,
...(evt.cleanupFailed ? { "openclaw.harness.cleanup_failed": true } : {}),
};
const span =
takeTrackedTrustedSpan(evt, metadata) ??
spanWithDuration("openclaw.harness.run", spanAttrs, evt.durationMs, {
parentContext: activeTrustedParentContext(evt, metadata),
endTimeMs: evt.ts,
});
setSpanAttrs(span, spanAttrs);
span.setStatus({
code: SpanStatusCode.ERROR,
message: errorType,
});
span.end(evt.ts);
};
const recordContextAssembled = (
evt: Extract<DiagnosticEventPayload, { type: "context.assembled" }>,
metadata: DiagnosticEventMetadata,
) => {
if (!tracesEnabled) {
return;
}
const spanAttrs: Record<string, string | number | boolean> = {
"openclaw.context.message_count": evt.messageCount,
"openclaw.context.history_text_chars": evt.historyTextChars,
"openclaw.context.history_image_blocks": evt.historyImageBlocks,
"openclaw.context.max_message_text_chars": evt.maxMessageTextChars,
"openclaw.context.system_prompt_chars": evt.systemPromptChars,
"openclaw.context.prompt_chars": evt.promptChars,
"openclaw.context.prompt_images": evt.promptImages,
};
addRunAttrs(spanAttrs, evt);
if (evt.contextTokenBudget !== undefined) {
spanAttrs["openclaw.context.token_budget"] = evt.contextTokenBudget;
}
if (evt.reserveTokens !== undefined) {
spanAttrs["openclaw.context.reserve_tokens"] = evt.reserveTokens;
}
const span = spanWithDuration("openclaw.context.assembled", spanAttrs, 0, {
parentContext: activeTrustedParentContext(evt, metadata),
endTimeMs: evt.ts,
});
span.end(evt.ts);
};
const recordModelFailover = (
evt: ModelFailoverDiagnosticEvent,
metadata: DiagnosticEventMetadata,
) => {
const metricAttrs: Record<string, string> = {
"openclaw.failover.reason": lowCardinalityAttr(evt.reason, "unknown"),
"openclaw.failover.suspended":
evt.suspended === undefined ? "unknown" : String(evt.suspended),
"openclaw.lane": lowCardinalityQueueLaneAttr(evt.lane, "unknown"),
"openclaw.model": lowCardinalityAttr(evt.fromModel),
"openclaw.provider": lowCardinalityAttr(evt.fromProvider),
"openclaw.failover.to_model": lowCardinalityAttr(evt.toModel),
"openclaw.failover.to_provider": lowCardinalityAttr(evt.toProvider),
};
modelFailoverCounter.add(1, metricAttrs);
if (!tracesEnabled) {
return;
}
const spanAttrs: Record<string, string | number | boolean> = {
"openclaw.failover.reason": lowCardinalityAttr(evt.reason, "unknown"),
};
if (evt.fromProvider) {
spanAttrs["openclaw.provider"] = evt.fromProvider;
}
if (evt.fromModel) {
spanAttrs["openclaw.model"] = evt.fromModel;
}
if (evt.toProvider) {
spanAttrs["openclaw.failover.to_provider"] = evt.toProvider;
}
if (evt.toModel) {
spanAttrs["openclaw.failover.to_model"] = evt.toModel;
}
if (evt.lane) {
spanAttrs["openclaw.lane"] = lowCardinalityQueueLaneAttr(evt.lane, "unknown");
}
if (evt.suspended !== undefined) {
spanAttrs["openclaw.failover.suspended"] = evt.suspended;
}
if (evt.cascadeDepth !== undefined) {
spanAttrs["openclaw.failover.cascade_depth"] = evt.cascadeDepth;
}
const span = spanWithDuration("openclaw.model.failover", spanAttrs, 0, {
parentContext: activeTrustedParentContext(evt, metadata),
endTimeMs: evt.ts,
});
span.end(evt.ts);
};
const modelCallMetricAttrs = (evt: ModelCallLifecycleDiagnosticEvent) => ({
"openclaw.provider": evt.provider,
"openclaw.model": evt.model,
"openclaw.api": lowCardinalityAttr(evt.api),
"openclaw.transport": lowCardinalityAttr(evt.transport),
});
const genAiModelCallMetricAttrs = (
evt: ModelCallLifecycleDiagnosticEvent,
errorType?: string,
) => ({
"gen_ai.operation.name": genAiOperationName(evt.api),
"gen_ai.provider.name": lowCardinalityAttr(evt.provider),
"gen_ai.request.model": lowCardinalityAttr(evt.model),
...(errorType ? { "error.type": errorType } : {}),
});
const recordModelCallSizeTimingMetrics = (
evt: Extract<DiagnosticEventPayload, { type: "model.call.completed" | "model.call.error" }>,
attrs: ReturnType<typeof modelCallMetricAttrs>,
) => {
const requestPayloadBytes = positiveFiniteNumber(evt.requestPayloadBytes);
if (requestPayloadBytes !== undefined) {
modelCallRequestBytesHistogram.record(requestPayloadBytes, attrs);
}
const responseStreamBytes = positiveFiniteNumber(evt.responseStreamBytes);
if (responseStreamBytes !== undefined) {
modelCallResponseBytesHistogram.record(responseStreamBytes, attrs);
}
const timeToFirstByteMs = positiveFiniteNumber(evt.timeToFirstByteMs);
if (timeToFirstByteMs !== undefined) {
modelCallTimeToFirstByteHistogram.record(timeToFirstByteMs, attrs);
}
};
const recordModelCallStarted = (
evt: Extract<DiagnosticEventPayload, { type: "model.call.started" }>,
metadata: DiagnosticEventMetadata,
) => {
if (!tracesEnabled || !metadata.trusted) {
return;
}
const spanAttrs: Record<string, string | number | boolean> = {
"openclaw.provider": evt.provider,
"openclaw.model": evt.model,
};
assignGenAiModelCallAttrs(spanAttrs, evt);
if (evt.api) {
spanAttrs["openclaw.api"] = evt.api;
}
if (evt.transport) {
spanAttrs["openclaw.transport"] = evt.transport;
}
trackTrustedSpan(
evt,
metadata,
spanWithDuration(modelCallSpanName(evt), spanAttrs, undefined, {
kind: modelCallSpanKind(),
parentContext: activeTrustedParentContext(evt, metadata),
startTimeMs: evt.ts,
}),
);
};
const recordModelCallCompleted = (
evt: Extract<DiagnosticEventPayload, { type: "model.call.completed" }>,
metadata: DiagnosticEventMetadata,
modelContent?: OtelModelCallContent,
) => {
const metricAttrs = modelCallMetricAttrs(evt);
modelCallDurationHistogram.record(evt.durationMs, metricAttrs);
recordModelCallSizeTimingMetrics(evt, metricAttrs);
genAiOperationDurationHistogram.record(
evt.durationMs / 1000,
genAiModelCallMetricAttrs(evt),
);
if (!tracesEnabled) {
return;
}
const spanAttrs: Record<string, string | number | boolean> = {
"openclaw.provider": evt.provider,
"openclaw.model": evt.model,
};
assignGenAiModelCallAttrs(spanAttrs, evt);
if (evt.api) {
spanAttrs["openclaw.api"] = evt.api;
}
if (evt.transport) {
spanAttrs["openclaw.transport"] = evt.transport;
}
assignModelCallSizeTimingAttrs(spanAttrs, evt);
assignOtelModelContentAttributes(spanAttrs, modelContent, contentCapturePolicy);
const span =
takeTrackedTrustedSpan(evt, metadata) ??
spanWithDuration(modelCallSpanName(evt), spanAttrs, evt.durationMs, {
kind: modelCallSpanKind(),
parentContext: activeTrustedParentContext(evt, metadata),
endTimeMs: evt.ts,
});
setSpanAttrs(span, spanAttrs);
addUpstreamRequestIdSpanEvent(span, evt.upstreamRequestIdHash);
span.end(evt.ts);
};
const recordModelCallError = (
evt: Extract<DiagnosticEventPayload, { type: "model.call.error" }>,
metadata: DiagnosticEventMetadata,
modelContent?: OtelModelCallContent,
) => {
const errorType = lowCardinalityAttr(evt.errorCategory, "other");
const metricAttrs = {
...modelCallMetricAttrs(evt),
"openclaw.errorCategory": errorType,
...(evt.failureKind
? { "openclaw.failureKind": lowCardinalityAttr(evt.failureKind, "other") }
: {}),
};
modelCallDurationHistogram.record(evt.durationMs, metricAttrs);
recordModelCallSizeTimingMetrics(evt, metricAttrs);
genAiOperationDurationHistogram.record(
evt.durationMs / 1000,
genAiModelCallMetricAttrs(evt, errorType),
);
if (!tracesEnabled) {
return;
}
const spanAttrs: Record<string, string | number | boolean> = {
"openclaw.provider": evt.provider,
"openclaw.model": evt.model,
"openclaw.errorCategory": errorType,
"error.type": errorType,
};
if (evt.failureKind) {
spanAttrs["openclaw.failureKind"] = lowCardinalityAttr(evt.failureKind, "other");
}
assignGenAiModelCallAttrs(spanAttrs, evt);
if (evt.api) {
spanAttrs["openclaw.api"] = evt.api;
}
if (evt.transport) {
spanAttrs["openclaw.transport"] = evt.transport;
}
assignModelCallSizeTimingAttrs(spanAttrs, evt);
assignOtelModelContentAttributes(spanAttrs, modelContent, contentCapturePolicy);
const span =
takeTrackedTrustedSpan(evt, metadata) ??
spanWithDuration(modelCallSpanName(evt), spanAttrs, evt.durationMs, {
kind: modelCallSpanKind(),
parentContext: activeTrustedParentContext(evt, metadata),
endTimeMs: evt.ts,
});
setSpanAttrs(span, spanAttrs);
addUpstreamRequestIdSpanEvent(span, evt.upstreamRequestIdHash);
span.setStatus({
code: SpanStatusCode.ERROR,
message: redactSensitiveText(evt.errorCategory),
});
span.end(evt.ts);
};
const toolExecutionBaseAttrs = (
evt: Extract<
DiagnosticEventPayload,
{
type:
| "tool.execution.started"
| "tool.execution.completed"
| "tool.execution.error"
| "tool.execution.blocked";
}
>,
): Record<string, string | number | boolean> => ({
"openclaw.toolName": evt.toolName,
"openclaw.tool.source": lowCardinalityAttr(evt.toolSource, "core"),
"gen_ai.tool.name": evt.toolName,
...(evt.toolOwner ? { "openclaw.tool.owner": lowCardinalityAttr(evt.toolOwner) } : {}),
...paramsSummaryAttrs(evt.paramsSummary),
});
const skillUsedAttrs = (
evt: Extract<DiagnosticEventPayload, { type: "skill.used" }>,
): Record<string, string | number | boolean> => ({
"openclaw.skill.name": lowCardinalityAttr(evt.skillName, "skill"),
"openclaw.skill.source": lowCardinalityAttr(evt.skillSource),
"openclaw.skill.activation": lowCardinalityAttr(evt.activation),
...(evt.agentId ? { "openclaw.agent": lowCardinalityAttr(evt.agentId) } : {}),
...(evt.toolName ? { "openclaw.toolName": lowCardinalityAttr(evt.toolName, "tool") } : {}),
});
const recordSkillUsed = (
evt: Extract<DiagnosticEventPayload, { type: "skill.used" }>,
metadata: DiagnosticEventMetadata,
) => {
if (!metadata.trusted) {
return;
}
const attrs = skillUsedAttrs(evt);
skillUsedCounter.add(1, attrs);
if (!tracesEnabled) {
return;
}
const spanAttrs: Record<string, string | number | boolean> = { ...attrs };
addRunAttrs(spanAttrs, evt);
const span = spanWithDuration("openclaw.skill.used", spanAttrs, 0, {
parentContext: activeTrustedParentContext(evt, metadata),
endTimeMs: evt.ts,
});
setSpanAttrs(span, spanAttrs);
span.end(evt.ts);
};
const recordToolExecutionStarted = (
evt: Extract<DiagnosticEventPayload, { type: "tool.execution.started" }>,
metadata: DiagnosticEventMetadata,
) => {
if (!tracesEnabled || !metadata.trusted) {
return;
}
trackTrustedSpan(
evt,
metadata,
spanWithDuration("openclaw.tool.execution", toolExecutionBaseAttrs(evt), undefined, {
parentContext: activeTrustedParentContext(evt, metadata),
startTimeMs: evt.ts,
}),
);
};
const recordToolExecutionCompleted = (
evt: Extract<DiagnosticEventPayload, { type: "tool.execution.completed" }>,
metadata: DiagnosticEventMetadata,
) => {
const attrs = toolExecutionBaseAttrs(evt);
toolExecutionDurationHistogram.record(evt.durationMs, attrs);
if (!tracesEnabled) {
return;
}
const spanAttrs: Record<string, string | number | boolean> = {
...toolExecutionBaseAttrs(evt),
};
addRunAttrs(spanAttrs, evt);
assignOtelToolContentAttributes(
spanAttrs,
evt as unknown as Record<string, unknown>,
contentCapturePolicy,
);
const span =
takeTrackedTrustedSpan(evt, metadata) ??
spanWithDuration("openclaw.tool.execution", spanAttrs, evt.durationMs, {
parentContext: activeTrustedParentContext(evt, metadata),
endTimeMs: evt.ts,
});
setSpanAttrs(span, spanAttrs);
span.end(evt.ts);
};
const recordToolExecutionError = (
evt: Extract<DiagnosticEventPayload, { type: "tool.execution.error" }>,
metadata: DiagnosticEventMetadata,
) => {
const attrs = {
...toolExecutionBaseAttrs(evt),
"openclaw.errorCategory": lowCardinalityAttr(evt.errorCategory, "other"),
};
toolExecutionDurationHistogram.record(evt.durationMs, attrs);
if (!tracesEnabled) {
return;
}
const spanAttrs: Record<string, string | number | boolean> = {
...toolExecutionBaseAttrs(evt),
"openclaw.errorCategory": lowCardinalityAttr(evt.errorCategory, "other"),
};
addRunAttrs(spanAttrs, evt);
if (evt.errorCode) {
spanAttrs["openclaw.errorCode"] = lowCardinalityAttr(evt.errorCode, "other");
}
assignOtelToolContentAttributes(
spanAttrs,
evt as unknown as Record<string, unknown>,
contentCapturePolicy,
);
const span =
takeTrackedTrustedSpan(evt, metadata) ??
spanWithDuration("openclaw.tool.execution", spanAttrs, evt.durationMs, {
parentContext: activeTrustedParentContext(evt, metadata),
endTimeMs: evt.ts,
});
setSpanAttrs(span, spanAttrs);
span.setStatus({
code: SpanStatusCode.ERROR,
message: redactSensitiveText(evt.errorCategory),
});
span.end(evt.ts);
};
const recordToolExecutionBlocked = (
evt: Extract<DiagnosticEventPayload, { type: "tool.execution.blocked" }>,
metadata: DiagnosticEventMetadata,
) => {
toolExecutionBlockedCounter.add(1, {
...toolExecutionBaseAttrs(evt),
"openclaw.deniedReason": lowCardinalityAttr(evt.deniedReason, "other"),
});
if (!tracesEnabled) {
return;
}
const spanAttrs: Record<string, string | number | boolean> = {
...toolExecutionBaseAttrs(evt),
"openclaw.outcome": "blocked",
"openclaw.deniedReason": lowCardinalityAttr(evt.deniedReason, "other"),
};
addRunAttrs(spanAttrs, evt);
const span = spanWithDuration("openclaw.tool.execution", spanAttrs, 0, {
parentContext: activeTrustedParentContext(evt, metadata),
endTimeMs: evt.ts,
});
setSpanAttrs(span, spanAttrs);
span.end(evt.ts);
};
const recordPayloadLarge = (
evt: Extract<DiagnosticEventPayload, { type: "payload.large" }>,
) => {
const attrs = {
"openclaw.payload.action": evt.action,
"openclaw.payload.surface": lowCardinalityAttr(evt.surface, "unknown"),
"openclaw.channel": lowCardinalityAttr(evt.channel, "none"),
"openclaw.plugin": lowCardinalityAttr(evt.pluginId, "none"),
"openclaw.reason": lowCardinalityAttr(evt.reason, "none"),
};
payloadLargeCounter.add(1, attrs);
const bytes = positiveFiniteNumber(evt.bytes);
if (bytes !== undefined) {
payloadLargeBytesHistogram.record(bytes, attrs);
}
};
const recordExecProcessCompleted = (
evt: Extract<DiagnosticEventPayload, { type: "exec.process.completed" }>,
) => {
const attrs: Record<string, string | number> = {
"openclaw.exec.target": evt.target,
"openclaw.exec.mode": evt.mode,
"openclaw.outcome": evt.outcome,
};
if (evt.failureKind) {
attrs["openclaw.failureKind"] = evt.failureKind;
}
execProcessDurationHistogram.record(evt.durationMs, attrs);
if (!tracesEnabled) {
return;
}
const spanAttrs: Record<string, string | number | boolean> = {
...attrs,
"openclaw.exec.command_length": evt.commandLength,
};
if (typeof evt.exitCode === "number") {
spanAttrs["openclaw.exec.exit_code"] = evt.exitCode;
}
if (evt.exitSignal) {
spanAttrs["openclaw.exec.exit_signal"] = lowCardinalityAttr(evt.exitSignal, "other");
}
if (evt.timedOut !== undefined) {
spanAttrs["openclaw.exec.timed_out"] = evt.timedOut;
}
const span = spanWithDuration("openclaw.exec", spanAttrs, evt.durationMs, {
endTimeMs: evt.ts,
});
if (evt.outcome === "failed") {
span.setStatus({
code: SpanStatusCode.ERROR,
...(evt.failureKind ? { message: evt.failureKind } : {}),
});
}
span.end(evt.ts);
};
const recordHeartbeat = (
evt: Extract<DiagnosticEventPayload, { type: "diagnostic.heartbeat" }>,
) => {
queueDepthHistogram.record(evt.queued, { "openclaw.channel": "heartbeat" });
};
const recordLivenessWarning = (
evt: Extract<DiagnosticEventPayload, { type: "diagnostic.liveness.warning" }>,
) => {
const reason = evt.reasons.join(":");
const attrs = {
"openclaw.liveness.reason": lowCardinalityAttr(reason, "unknown"),
};
livenessWarningCounter.add(1, attrs);
queueDepthHistogram.record(evt.queued, { "openclaw.channel": "liveness" });
if (evt.eventLoopDelayP99Ms !== undefined) {
livenessEventLoopDelayP99Histogram.record(evt.eventLoopDelayP99Ms, attrs);
}
if (evt.eventLoopDelayMaxMs !== undefined) {
livenessEventLoopDelayMaxHistogram.record(evt.eventLoopDelayMaxMs, attrs);
}
if (evt.eventLoopUtilization !== undefined) {
livenessEventLoopUtilizationHistogram.record(evt.eventLoopUtilization, attrs);
}
if (evt.cpuCoreRatio !== undefined) {
livenessCpuCoreRatioHistogram.record(evt.cpuCoreRatio, attrs);
}
if (!tracesEnabled) {
return;
}
const spanAttrs: Record<string, string | number> = {
...attrs,
"openclaw.liveness.active": evt.active,
"openclaw.liveness.waiting": evt.waiting,
"openclaw.liveness.queued": evt.queued,
"openclaw.liveness.interval_ms": evt.intervalMs,
...(evt.eventLoopDelayP99Ms !== undefined
? { "openclaw.liveness.event_loop_delay_p99_ms": evt.eventLoopDelayP99Ms }
: {}),
...(evt.eventLoopDelayMaxMs !== undefined
? { "openclaw.liveness.event_loop_delay_max_ms": evt.eventLoopDelayMaxMs }
: {}),
...(evt.eventLoopUtilization !== undefined
? { "openclaw.liveness.event_loop_utilization": evt.eventLoopUtilization }
: {}),
...(evt.cpuUserMs !== undefined
? { "openclaw.liveness.cpu_user_ms": evt.cpuUserMs }
: {}),
...(evt.cpuSystemMs !== undefined
? { "openclaw.liveness.cpu_system_ms": evt.cpuSystemMs }
: {}),
...(evt.cpuTotalMs !== undefined
? { "openclaw.liveness.cpu_total_ms": evt.cpuTotalMs }
: {}),
...(evt.cpuCoreRatio !== undefined
? { "openclaw.liveness.cpu_core_ratio": evt.cpuCoreRatio }
: {}),
};
const span = spanWithDuration("openclaw.liveness.warning", spanAttrs, 0, {
endTimeMs: evt.ts,
});
span.setStatus({
code: SpanStatusCode.ERROR,
message: reason,
});
span.end(evt.ts);
};
const recordDiagnosticPhaseCompleted = (
evt: Extract<DiagnosticEventPayload, { type: "diagnostic.phase.completed" }>,
) => {
if (!tracesEnabled) {
return;
}
const spanAttrs: Record<string, string | number> = {
"openclaw.phase": lowCardinalityAttr(evt.name, "unknown"),
...(evt.cpuUserMs !== undefined ? { "openclaw.phase.cpu_user_ms": evt.cpuUserMs } : {}),
...(evt.cpuSystemMs !== undefined
? { "openclaw.phase.cpu_system_ms": evt.cpuSystemMs }
: {}),
...(evt.cpuTotalMs !== undefined
? { "openclaw.phase.cpu_total_ms": evt.cpuTotalMs }
: {}),
...(evt.cpuCoreRatio !== undefined
? { "openclaw.phase.cpu_core_ratio": evt.cpuCoreRatio }
: {}),
};
for (const [key, value] of Object.entries(evt.details ?? {})) {
spanAttrs[`openclaw.phase.detail.${key}`] =
typeof value === "boolean" ? String(value) : value;
}
const span = spanWithDuration("openclaw.diagnostic.phase", spanAttrs, evt.durationMs, {
endTimeMs: evt.ts,
});
span.end(evt.ts);
};
const recordTelemetryExporter = (
evt: TelemetryExporterDiagnosticEvent,
metadata: DiagnosticEventMetadata,
) => {
if (!metadata.trusted) {
return;
}
telemetryExporterCounter.add(1, {
"openclaw.exporter": lowCardinalityAttr(evt.exporter, "unknown"),
"openclaw.signal": evt.signal,
"openclaw.status": evt.status,
...(evt.reason ? { "openclaw.reason": evt.reason } : {}),
...(evt.errorCategory
? { "openclaw.errorCategory": lowCardinalityAttr(evt.errorCategory, "other") }
: {}),
});
};
const subscribe = ctx.internalDiagnostics?.onEvent;
if (!subscribe) {
ctx.logger.error("diagnostics-otel: internal diagnostics capability unavailable");
return;
}
unsubscribe = subscribe((evt, metadata, privateData) => {
try {
switch (evt.type) {
case "model.usage":
recordModelUsage(evt, metadata);
return;
case "webhook.received":
recordWebhookReceived(evt);
return;
case "webhook.processed":
recordWebhookProcessed(evt);
return;
case "webhook.error":
recordWebhookError(evt);
return;
case "message.queued":
recordMessageQueued(evt);
return;
case "message.received":
recordMessageReceived(evt);
return;
case "message.dispatch.started":
recordMessageDispatchStarted(evt);
return;
case "message.dispatch.completed":
recordMessageDispatchCompleted(evt);
return;
case "message.processed":
recordMessageProcessed(evt);
return;
case "message.delivery.started":
recordMessageDeliveryStarted(evt);
return;
case "message.delivery.completed":
recordMessageDeliveryCompleted(evt);
return;
case "message.delivery.error":
recordMessageDeliveryError(evt);
return;
case "talk.event":
recordTalkEvent(evt, metadata);
return;
case "queue.lane.enqueue":
recordLaneEnqueue(evt);
return;
case "queue.lane.dequeue":
recordLaneDequeue(evt);
return;
case "session.state":
recordSessionState(evt);
return;
case "session.long_running":
case "session.stalled":
return;
case "session.turn.created":
recordSessionTurnCreated(evt);
return;
case "session.stuck":
recordSessionStuck(evt);
return;
case "session.recovery.requested":
recordSessionRecoveryRequested(evt);
return;
case "session.recovery.completed":
recordSessionRecoveryCompleted(evt);
return;
case "run.attempt":
recordRunAttempt(evt);
return;
case "run.progress":
return;
case "diagnostic.heartbeat":
recordHeartbeat(evt);
return;
case "diagnostic.liveness.warning":
recordLivenessWarning(evt);
return;
case "diagnostic.phase.completed":
recordDiagnosticPhaseCompleted(evt);
return;
case "run.started":
recordRunStarted(evt, metadata);
return;
case "run.completed":
recordRunCompleted(evt, metadata);
return;
case "harness.run.started":
recordHarnessRunStarted(evt, metadata);
return;
case "harness.run.completed":
recordHarnessRunCompleted(evt, metadata);
return;
case "harness.run.error":
recordHarnessRunError(evt, metadata);
return;
case "context.assembled":
recordContextAssembled(evt, metadata);
return;
case "model.call.started":
recordModelCallStarted(evt, metadata);
return;
case "model.call.completed":
recordModelCallCompleted(evt, metadata, privateData.modelContent);
return;
case "model.call.error":
recordModelCallError(evt, metadata, privateData.modelContent);
return;
case "tool.execution.started":
recordToolExecutionStarted(evt, metadata);
return;
case "tool.execution.completed":
recordToolExecutionCompleted(evt, metadata);
return;
case "tool.execution.error":
recordToolExecutionError(evt, metadata);
return;
case "tool.execution.blocked":
recordToolExecutionBlocked(evt, metadata);
return;
case "skill.used":
recordSkillUsed(evt, metadata);
return;
case "exec.process.completed":
recordExecProcessCompleted(evt);
return;
case "log.record":
recordLogRecord?.(evt, metadata);
return;
case "tool.loop":
recordToolLoop(evt);
return;
case "diagnostic.memory.sample":
recordMemorySample(evt);
return;
case "diagnostic.memory.pressure":
recordMemoryPressure(evt);
return;
case "diagnostic.async_queue.dropped":
recordAsyncQueueDropped(evt);
return;
case "telemetry.exporter":
recordTelemetryExporter(evt, metadata);
return;
case "payload.large":
recordPayloadLarge(evt);
return;
case "model.failover":
recordModelFailover(evt, metadata);
return;
}
} catch (err) {
ctx.logger.error(
`diagnostics-otel: event handler failed (${evt.type}): ${formatError(err)}`,
);
}
});
unregisterUnhandledRejectionHandler = registerUnhandledRejectionHandler((reason) => {
const otlpError = findOtlpExporterError(reason);
if (!otlpError) {
return false;
}
const code = readErrorCode(otlpError) ?? "unknown";
ctx.logger.warn(
`diagnostics-otel: suppressed OTLP exporter unhandled rejection (code=${String(code)})`,
);
return true;
});
emitForSignals(enabledSignals, {
exporter: "diagnostics-otel",
status: "started",
reason: "configured",
});
if (logsEnabled) {
ctx.logger.info("diagnostics-otel: logs exporter enabled (OTLP/Protobuf)");
}
},
async stop() {
await stopStarted();
},
} satisfies OpenClawPluginService;
}