feat(diagnostics-prometheus): add protected metrics exporter

This commit is contained in:
Vincent Koc
2026-04-26 01:05:45 -07:00
parent 6cd047e7c2
commit 0f2e7510cb
19 changed files with 1062 additions and 7 deletions

4
.github/labeler.yml vendored
View File

@@ -233,6 +233,10 @@
- changed-files:
- any-glob-to-any-file:
- "extensions/diagnostics-otel/**"
"extensions: diagnostics-prometheus":
- changed-files:
- any-glob-to-any-file:
- "extensions/diagnostics-prometheus/**"
"extensions: llm-task":
- changed-files:
- any-glob-to-any-file:

View File

@@ -44,6 +44,7 @@ Docs: https://docs.openclaw.ai
- Diagnostics/OTEL: emit bounded telemetry exporter health diagnostics for startup and log-export failures without exporting raw error text. Thanks @vincentkoc.
- Diagnostics/OTEL: export agent harness lifecycle telemetry as bounded `openclaw.harness.run` spans and `openclaw.harness.duration_ms` metrics so QA-lab, Codex, and future harnesses share one trace shape. Thanks @vincentkoc.
- Diagnostics/trace: propagate W3C `traceparent` headers from trusted model-call trace context to provider transports while replacing caller-supplied traceparent values. Thanks @vincentkoc.
- Diagnostics/Prometheus: add a bundled `diagnostics-prometheus` plugin with a protected gateway scrape route for low-cardinality diagnostics metrics. Thanks @vincentkoc.
- Plugins/CLI: add `openclaw plugins registry` for explicit persisted-registry inspection and `--refresh` repair without making normal startup rescan plugin locations. Thanks @vincentkoc.
- Plugins/CLI: make `openclaw plugins list` read the cold persisted registry snapshot by default, leaving module-aware diagnostics to `plugins doctor` and `plugins inspect`. Thanks @vincentkoc.
- Plugins/startup: move gateway startup plugin planning onto the versioned cold registry index, with postinstall repair for older registry files that predate startup metadata. Thanks @vincentkoc.

View File

@@ -1442,6 +1442,7 @@
"gateway/doctor",
"logging",
"gateway/opentelemetry",
"gateway/prometheus",
"gateway/logging",
"gateway/diagnostics",
"gateway/troubleshooting"

View File

@@ -0,0 +1,89 @@
---
summary: "Expose OpenClaw diagnostics as Prometheus text metrics through the diagnostics-prometheus plugin"
title: "Prometheus metrics"
read_when:
- You want Prometheus, Grafana, VictoriaMetrics, or another scraper to collect OpenClaw Gateway metrics
- You need the Prometheus metric names and label policy for dashboards or alerts
- You want metrics without running an OpenTelemetry collector
---
OpenClaw can expose diagnostics metrics through the bundled
`diagnostics-prometheus` plugin. It listens to trusted internal diagnostics and
renders a Prometheus text endpoint at:
```text
/api/diagnostics/prometheus
```
The route uses Gateway authentication. Do not expose it as a public
unauthenticated `/metrics` endpoint.
## Quick start
```json5
{
plugins: {
allow: ["diagnostics-prometheus"],
entries: {
"diagnostics-prometheus": { enabled: true },
},
},
diagnostics: {
enabled: true,
},
}
```
You can also enable the plugin from the CLI:
```bash
openclaw plugins enable diagnostics-prometheus
```
Then scrape the protected Gateway route with the same Gateway authentication you
use for operator APIs.
## Metrics exported
| Metric | Type | Labels |
| --------------------------------------------- | --------- | ----------------------------------------------------------------------------------------- |
| `openclaw_run_completed_total` | counter | `channel`, `model`, `outcome`, `provider`, `trigger` |
| `openclaw_run_duration_seconds` | histogram | `channel`, `model`, `outcome`, `provider`, `trigger` |
| `openclaw_model_call_total` | counter | `api`, `error_category`, `model`, `outcome`, `provider`, `transport` |
| `openclaw_model_call_duration_seconds` | histogram | `api`, `error_category`, `model`, `outcome`, `provider`, `transport` |
| `openclaw_model_tokens_total` | counter | `agent`, `channel`, `model`, `provider`, `token_type` |
| `openclaw_gen_ai_client_token_usage` | histogram | `model`, `provider`, `token_type` |
| `openclaw_model_cost_usd_total`               | counter   | `agent`, `channel`, `model`, `provider`                                                   |
| `openclaw_model_usage_duration_seconds`       | histogram | `agent`, `channel`, `model`, `provider`                                                   |
| `openclaw_tool_execution_total` | counter | `error_category`, `outcome`, `params_kind`, `tool` |
| `openclaw_tool_execution_duration_seconds` | histogram | `error_category`, `outcome`, `params_kind`, `tool` |
| `openclaw_harness_run_total` | counter | `channel`, `error_category`, `harness`, `model`, `outcome`, `phase`, `plugin`, `provider` |
| `openclaw_harness_run_duration_seconds` | histogram | `channel`, `error_category`, `harness`, `model`, `outcome`, `phase`, `plugin`, `provider` |
| `openclaw_message_processed_total` | counter | `channel`, `outcome`, `reason` |
| `openclaw_message_processed_duration_seconds` | histogram | `channel`, `outcome`, `reason` |
| `openclaw_message_delivery_total` | counter | `channel`, `delivery_kind`, `error_category`, `outcome` |
| `openclaw_message_delivery_duration_seconds` | histogram | `channel`, `delivery_kind`, `error_category`, `outcome` |
| `openclaw_queue_lane_size` | gauge | `lane` |
| `openclaw_queue_lane_wait_seconds` | histogram | `lane` |
| `openclaw_session_state_total` | counter | `reason`, `state` |
| `openclaw_session_queue_depth` | gauge | `state` |
| `openclaw_memory_bytes` | gauge | `kind` |
| `openclaw_memory_rss_bytes` | histogram | none |
| `openclaw_memory_pressure_total` | counter | `level`, `reason` |
| `openclaw_telemetry_exporter_total` | counter | `exporter`, `reason`, `signal`, `status` |
| `openclaw_prometheus_series_dropped_total` | counter | none |
## Label policy
Prometheus labels stay bounded and low-cardinality. The exporter does not emit
raw diagnostic identifiers such as `runId`, `sessionKey`, `sessionId`, `callId`,
`toolCallId`, message IDs, chat IDs, or provider request IDs.
Label values are redacted and must match OpenClaw's low-cardinality character
policy. Values that fail the policy are replaced with a fixed fallback
(`unknown`, `other`, `none`, or `tool`), depending on the label.
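The exporter caps retained time series in memory at 2048 series across counters,
gauges, and histograms. Once the cap is reached, writes that would create a new
series are dropped and `openclaw_prometheus_series_dropped_total` increments
once per dropped write.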
For full traces, logs, OTLP export, and OpenTelemetry GenAI semantic attributes,
use [OpenTelemetry export](/gateway/opentelemetry).

View File

@@ -420,8 +420,9 @@ The same rule applies to other bundled-helper families such as:
`plugin-sdk/nextcloud-talk`, `plugin-sdk/nostr`, `plugin-sdk/tlon`,
`plugin-sdk/twitch`,
`plugin-sdk/github-copilot-login`, `plugin-sdk/github-copilot-token`,
`plugin-sdk/diagnostics-otel`, `plugin-sdk/diffs`, `plugin-sdk/llm-task`,
`plugin-sdk/thread-ownership`, and `plugin-sdk/voice-call`
`plugin-sdk/diagnostics-otel`, `plugin-sdk/diagnostics-prometheus`,
`plugin-sdk/diffs`, `plugin-sdk/llm-task`, `plugin-sdk/thread-ownership`,
and `plugin-sdk/voice-call`
`plugin-sdk/github-copilot-token` currently exposes the narrow token-helper
surface `DEFAULT_COPILOT_API_BASE_URL`,

View File

@@ -271,7 +271,7 @@ For the plugin authoring guide, see [Plugin SDK overview](/plugins/sdk-overview)
| Line | `plugin-sdk/line`, `plugin-sdk/line-core`, `plugin-sdk/line-runtime`, `plugin-sdk/line-surface` | Bundled LINE helper/runtime surface |
| IRC | `plugin-sdk/irc`, `plugin-sdk/irc-surface` | Bundled IRC helper surface |
| Channel-specific helpers | `plugin-sdk/googlechat`, `plugin-sdk/zalouser`, `plugin-sdk/bluebubbles`, `plugin-sdk/bluebubbles-policy`, `plugin-sdk/mattermost`, `plugin-sdk/mattermost-policy`, `plugin-sdk/feishu-conversation`, `plugin-sdk/msteams`, `plugin-sdk/nextcloud-talk`, `plugin-sdk/nostr`, `plugin-sdk/tlon`, `plugin-sdk/twitch` | Bundled channel compatibility/helper seams |
| Auth/plugin-specific helpers | `plugin-sdk/github-copilot-login`, `plugin-sdk/github-copilot-token`, `plugin-sdk/diagnostics-otel`, `plugin-sdk/diffs`, `plugin-sdk/llm-task`, `plugin-sdk/thread-ownership`, `plugin-sdk/voice-call` | Bundled feature/plugin helper seams; `plugin-sdk/github-copilot-token` currently exports `DEFAULT_COPILOT_API_BASE_URL`, `deriveCopilotApiBaseUrlFromToken`, and `resolveCopilotApiToken` |
| Auth/plugin-specific helpers | `plugin-sdk/github-copilot-login`, `plugin-sdk/github-copilot-token`, `plugin-sdk/diagnostics-otel`, `plugin-sdk/diagnostics-prometheus`, `plugin-sdk/diffs`, `plugin-sdk/llm-task`, `plugin-sdk/thread-ownership`, `plugin-sdk/voice-call` | Bundled feature/plugin helper seams; `plugin-sdk/github-copilot-token` currently exports `DEFAULT_COPILOT_API_BASE_URL`, `deriveCopilotApiBaseUrlFromToken`, and `resolveCopilotApiToken` |
</Accordion>
</AccordionGroup>

View File

@@ -0,0 +1 @@
export * from "openclaw/plugin-sdk/diagnostics-prometheus";

View File

@@ -0,0 +1,20 @@
import { definePluginEntry } from "openclaw/plugin-sdk/plugin-entry";
import { createDiagnosticsPrometheusExporter } from "./src/service.js";
// One exporter instance per module load: the registered service and the HTTP
// route handler both come from it.
const exporter = createDiagnosticsPrometheusExporter();

/**
 * Plugin entry for the bundled Prometheus exporter.
 *
 * Registers the exporter service, then an exact-match scrape route that is
 * declared with gateway auth on the trusted-operator scope surface.
 */
export default definePluginEntry({
  id: "diagnostics-prometheus",
  name: "Diagnostics Prometheus",
  description: "Expose OpenClaw diagnostics metrics in Prometheus text format",
  register(api) {
    const { service, handler } = exporter;
    api.registerService(service);
    api.registerHttpRoute({
      handler,
      path: "/api/diagnostics/prometheus",
      match: "exact",
      auth: "gateway",
      gatewayRuntimeScopeSurface: "trusted-operator",
    });
  },
});

View File

@@ -0,0 +1,8 @@
{
"id": "diagnostics-prometheus",
"configSchema": {
"type": "object",
"additionalProperties": false,
"properties": {}
}
}

View File

@@ -0,0 +1,24 @@
{
"name": "@openclaw/diagnostics-prometheus",
"version": "2026.4.25",
"description": "OpenClaw diagnostics Prometheus exporter",
"type": "module",
"devDependencies": {
"@openclaw/plugin-sdk": "workspace:*"
},
"openclaw": {
"extensions": [
"./index.ts"
],
"compat": {
"pluginApi": ">=2026.4.25"
},
"build": {
"openclawVersion": "2026.4.25"
},
"release": {
"publishToClawHub": true,
"publishToNpm": true
}
}
}

View File

@@ -0,0 +1,169 @@
import { describe, expect, it, vi } from "vitest";
import type { DiagnosticEventMetadata, DiagnosticEventPayload } from "../api.js";
import { createDiagnosticsPrometheusExporter, __test__ } from "./service.js";
// Frozen metadata fixtures: `trusted` events are recorded by the exporter,
// `untrusted` events are expected to be ignored.
const trusted: DiagnosticEventMetadata = Object.freeze({ trusted: true });
const untrusted: DiagnosticEventMetadata = Object.freeze({ trusted: false });
/** Returns the `seq`/`ts` envelope fields that every test event spreads in. */
function baseEvent(): Pick<DiagnosticEventPayload, "seq" | "ts"> {
return { seq: 1, ts: 1700000000000 };
}
describe("diagnostics-prometheus service", () => {
// A trusted run.completed event must produce the run counter and the duration
// histogram (1500 ms rendered as 1.5 seconds), and the raw runId/sessionKey
// values must never appear in the rendered text.
it("records trusted run metrics without raw diagnostic identifiers", () => {
const store = __test__.createPrometheusMetricStore();
__test__.recordDiagnosticEvent(
store,
{
...baseEvent(),
type: "run.completed",
runId: "run-should-not-export",
sessionKey: "session-should-not-export",
provider: "openai",
model: "gpt-5.4",
channel: "discord",
trigger: "message",
durationMs: 1500,
outcome: "completed",
},
trusted,
);
const rendered = __test__.renderPrometheusMetrics(store);
expect(rendered).toContain("# TYPE openclaw_run_completed_total counter");
// Labels are rendered in sorted key order.
expect(rendered).toContain(
'openclaw_run_completed_total{channel="discord",model="gpt-5.4",outcome="completed",provider="openai",trigger="message"} 1',
);
expect(rendered).toContain(
'openclaw_run_duration_seconds_sum{channel="discord",model="gpt-5.4",outcome="completed",provider="openai",trigger="message"} 1.5',
);
expect(rendered).not.toContain("run-should-not-export");
expect(rendered).not.toContain("session-should-not-export");
});
// Events delivered with `trusted: false` metadata must leave the store empty,
// so the rendered output is the empty string.
it("drops untrusted plugin-emitted diagnostic events", () => {
const store = __test__.createPrometheusMetricStore();
__test__.recordDiagnosticEvent(
store,
{
...baseEvent(),
type: "model.call.completed",
runId: "run-1",
callId: "call-1",
provider: "openai",
model: "gpt-5.4",
durationMs: 10,
},
untrusted,
);
expect(__test__.renderPrometheusMetrics(store)).toBe("");
});
// Label values that fail the low-cardinality policy fall back to fixed
// placeholders: the tool name (contains a newline) becomes "tool" and the
// secret-looking error category becomes "other".
it("redacts and bounds label values", () => {
const store = __test__.createPrometheusMetricStore();
__test__.recordDiagnosticEvent(
store,
{
...baseEvent(),
type: "tool.execution.error",
toolName: "shell\nbad",
durationMs: 25,
errorCategory: "Bearer sk-secret-token-value",
},
trusted,
);
const rendered = __test__.renderPrometheusMetrics(store);
expect(rendered).toContain(
'openclaw_tool_execution_total{error_category="other",outcome="error",params_kind="unknown",tool="tool"} 1',
);
expect(rendered).not.toContain("Bearer");
expect(rendered).not.toContain("sk-secret");
});
// Each iteration uses a distinct model label, so every event asks for a new
// counter series and a new histogram series. 2100 iterations are enough to
// exceed the exporter's series cap, after which the dropped-series counter
// must show up in the rendered output.
it("caps metric series growth and reports dropped series", () => {
const store = __test__.createPrometheusMetricStore();
for (let index = 0; index < 2100; index += 1) {
__test__.recordDiagnosticEvent(
store,
{
...baseEvent(),
type: "model.call.completed",
runId: `run-${index}`,
callId: `call-${index}`,
provider: "openai",
model: `model.${index}`,
durationMs: 10,
},
trusted,
);
}
const rendered = __test__.renderPrometheusMetrics(store);
expect(rendered).toContain("# TYPE openclaw_prometheus_series_dropped_total counter");
expect(rendered).toContain("openclaw_prometheus_series_dropped_total ");
});
// End-to-end service lifecycle: start() must subscribe to internal
// diagnostics and emit a "started" telemetry.exporter event, events delivered
// through the captured listener must be rendered, and stop() must unsubscribe
// and clear the store.
it("subscribes to internal diagnostics and renders scrape text", () => {
// Listeners registered via onEvent are captured so the test can deliver events by hand.
const listeners: Array<
(event: DiagnosticEventPayload, metadata: DiagnosticEventMetadata) => void
> = [];
// Events the service emits through internalDiagnostics.emit.
const emitted: unknown[] = [];
const exporter = createDiagnosticsPrometheusExporter();
const unsubscribe = vi.fn();
exporter.service.start({
config: {} as never,
stateDir: "/tmp/openclaw-prometheus-test",
logger: {
info: vi.fn(),
warn: vi.fn(),
error: vi.fn(),
debug: vi.fn(),
},
internalDiagnostics: {
emit: (event) => emitted.push(event),
onEvent: (listener) => {
listeners.push(listener);
return unsubscribe;
},
},
});
listeners[0]?.(
{
...baseEvent(),
type: "model.usage",
provider: "openai",
model: "gpt-5.4",
usage: { input: 12, output: 3, total: 15 },
},
trusted,
);
expect(emitted).toContainEqual(
expect.objectContaining({
type: "telemetry.exporter",
exporter: "diagnostics-prometheus",
signal: "metrics",
status: "started",
}),
);
// agentId and channel were not supplied, so both labels use the "unknown" fallback.
expect(exporter.render()).toContain(
'openclaw_model_tokens_total{agent="unknown",channel="unknown",model="gpt-5.4",provider="openai",token_type="input"} 12',
);
exporter.service.stop?.();
expect(unsubscribe).toHaveBeenCalledOnce();
expect(exporter.render()).toBe("");
});
});

View File

@@ -0,0 +1,684 @@
import type { IncomingMessage, ServerResponse } from "node:http";
import type {
DiagnosticEventMetadata,
DiagnosticEventPayload,
OpenClawPluginHttpRouteHandler,
OpenClawPluginService,
} from "../api.js";
import { redactSensitiveText } from "../api.js";
/** Label name to label value pairs identifying one metric series. */
type LabelSet = Record<string, string>;
/** One counter series: help text, its labels, and the accumulated value. */
type CounterSample = {
help: string;
labels: LabelSet;
value: number;
};
/**
 * One histogram series.
 *
 * `buckets` holds the upper bounds; `counts[i]` is the number of observations
 * less than or equal to `buckets[i]` (cumulative). `count` and `sum` are the
 * total number and total value of all observations.
 */
type HistogramSample = {
buckets: number[];
counts: number[];
count: number;
help: string;
labels: LabelSet;
sum: number;
};
/** One gauge series: help text, its labels, and the most recently set value. */
type GaugeSample = {
help: string;
labels: LabelSet;
value: number;
};
/** Copy of the store's series maps, keyed by `metricKey(name, labels)`. */
type MetricSnapshot = {
counters: Map<string, CounterSample>;
gauges: Map<string, GaugeSample>;
histograms: Map<string, HistogramSample>;
};
/** Store API returned by `createPrometheusMetricStore`. */
type PrometheusMetricStore = ReturnType<typeof createPrometheusMetricStore>;
// Default histogram upper bounds, in seconds (5 ms up to 10 minutes).
const DURATION_BUCKETS_SECONDS = [
0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10, 30, 60, 120, 300, 600,
];
// Histogram upper bounds for token counts (powers of four, 1 up to 1,048,576).
const TOKEN_BUCKETS = [1, 4, 16, 64, 256, 1024, 4096, 16384, 65536, 262144, 1048576];
// Histogram upper bounds for byte sizes (powers of four, 1 KiB up to 16 GiB).
const BYTE_BUCKETS = [
1024, 4096, 16384, 65536, 262144, 1048576, 4194304, 16777216, 67108864, 268435456, 1073741824,
4294967296, 17179869184,
];
// Accepted label values: 1-120 characters drawn from letters, digits, `_`, `.`, `:` and `-`.
const LOW_CARDINALITY_VALUE_RE = /^[A-Za-z0-9_.:-]{1,120}$/u;
// Upper bound on retained series, counted across counters, gauges and histograms together.
const MAX_PROMETHEUS_SERIES = 2048;
// Name of the synthetic counter that reports writes rejected by the series cap.
const DROPPED_SERIES_COUNTER_NAME = "openclaw_prometheus_series_dropped_total";
/**
 * Converts a raw diagnostic field into a bounded Prometheus label value.
 *
 * The value is trimmed and passed through `redactSensitiveText`; the result is
 * kept only when it matches `LOW_CARDINALITY_VALUE_RE`.
 *
 * @param value Raw field value; missing or empty values yield `fallback`.
 * @param fallback Label used when the value is absent or rejected.
 * @returns The accepted label value, or `fallback`.
 */
function lowCardinalityLabel(value: string | undefined, fallback = "unknown"): string {
  if (value) {
    const candidate = redactSensitiveText(value.trim());
    if (LOW_CARDINALITY_VALUE_RE.test(candidate)) {
      return candidate;
    }
  }
  return fallback;
}
/**
 * Accepts only finite, non-negative numbers.
 *
 * @param value Candidate measurement.
 * @returns `value` unchanged when usable, otherwise `undefined`.
 */
function numericValue(value: number | undefined): number | undefined {
  if (typeof value !== "number") {
    return undefined;
  }
  if (!Number.isFinite(value) || value < 0) {
    return undefined;
  }
  return value;
}
/**
 * Converts a millisecond measurement to seconds.
 *
 * @param ms Duration in milliseconds.
 * @returns The duration in seconds, or `undefined` when `numericValue` rejects the input.
 */
function seconds(ms: number | undefined): number | undefined {
  const millis = numericValue(ms);
  if (millis === undefined) {
    return undefined;
  }
  return millis / 1000;
}
/**
 * Returns the label entries ordered by label name, leaving the input untouched.
 *
 * @param labels Label set to order.
 * @returns A new array of `[name, value]` pairs sorted by name via `localeCompare`.
 */
function sortedLabels(labels: LabelSet): [string, string][] {
  const pairs = Object.entries(labels);
  return pairs.toSorted((a, b) => a[0].localeCompare(b[0]));
}
/**
 * Builds the map key for a series: the metric name, a `|` separator, and the
 * JSON encoding of the name-sorted label pairs.
 *
 * @param name Metric name.
 * @param labels Labels identifying the series.
 * @returns A key that is identical for equal label sets regardless of property order.
 */
function metricKey(name: string, labels: LabelSet): string {
  const serializedLabels = JSON.stringify(sortedLabels(labels));
  return [name, serializedLabels].join("|");
}
/**
 * Escapes text for a `# HELP` line: backslash becomes `\\` and a line feed
 * becomes the two characters `\n`.
 *
 * @param value Raw help text.
 * @returns The escaped help text.
 */
function escapeHelp(value: string): string {
  return value.replace(/[\\\n]/g, (ch) => (ch === "\n" ? "\\n" : "\\\\"));
}
/**
 * Escapes a label value for the text format: backslash becomes `\\`, a line
 * feed becomes `\n`, and a double quote becomes `\"`.
 *
 * @param value Raw label value.
 * @returns The escaped label value, without surrounding quotes.
 */
function escapeLabelValue(value: string): string {
  return value.replace(/[\\\n"]/g, (ch) => {
    switch (ch) {
      case "\n":
        return "\\n";
      case '"':
        return '\\"';
      default:
        return "\\\\";
    }
  });
}
/**
 * Renders a label set as `{name="value",...}` with names in sorted order and
 * values escaped.
 *
 * @param labels Labels to render.
 * @returns The rendered label block, or an empty string when there are no labels.
 */
function formatLabels(labels: LabelSet): string {
  const pairs = sortedLabels(labels);
  if (pairs.length === 0) {
    return "";
  }
  const rendered: string[] = [];
  for (const [labelName, rawValue] of pairs) {
    rendered.push(`${labelName}="${escapeLabelValue(rawValue)}"`);
  }
  return "{" + rendered.join(",") + "}";
}
/**
 * Formats a sample value for the text exposition.
 *
 * Non-finite values are written as `0`, integers are written as-is, and
 * fractional values are rounded to 12 significant digits to hide
 * floating-point noise.
 *
 * @param value Sample value.
 * @returns The textual representation of the value.
 */
function formatPrometheusNumber(value: number): string {
  if (!Number.isFinite(value)) {
    return "0";
  }
  if (Number.isInteger(value)) {
    return String(value);
  }
  const rounded = Number(value.toPrecision(12));
  return String(rounded);
}
/**
 * Creates an in-memory store for counter, gauge and histogram series.
 *
 * Series are keyed by `metricKey(name, labels)`. The combined number of
 * retained series is capped at `MAX_PROMETHEUS_SERIES`; a write that would
 * create a series beyond the cap is discarded and counted, and the count is
 * surfaced by `snapshot()` as `DROPPED_SERIES_COUNTER_NAME`.
 *
 * @returns Writers (`counter`, `gauge`, `histogram`) plus `snapshot` and `reset`.
 */
function createPrometheusMetricStore() {
  const counterSeries = new Map<string, CounterSample>();
  const gaugeSeries = new Map<string, GaugeSample>();
  const histogramSeries = new Map<string, HistogramSample>();
  // Number of writes rejected because they would have created a series past the cap.
  let rejectedWrites = 0;

  const retainedSeriesCount = (): number =>
    counterSeries.size + gaugeSeries.size + histogramSeries.size;

  // A write is admitted when its series already exists, when it targets the
  // dropped-series counter itself, or while there is still room under the cap.
  const admitSeries = <T>(target: Map<string, T>, key: string, metricName: string): boolean => {
    const admitted =
      target.has(key) ||
      metricName === DROPPED_SERIES_COUNTER_NAME ||
      retainedSeriesCount() < MAX_PROMETHEUS_SERIES;
    if (!admitted) {
      rejectedWrites += 1;
    }
    return admitted;
  };

  /** Adds `amount` to a counter series; non-finite or non-positive amounts are ignored. */
  const counter = (name: string, help: string, labels: LabelSet, amount = 1) => {
    const isPositiveFinite = Number.isFinite(amount) && amount > 0;
    if (!isPositiveFinite) {
      return;
    }
    const key = metricKey(name, labels);
    if (!admitSeries(counterSeries, key, name)) {
      return;
    }
    const sample = counterSeries.get(key);
    if (sample === undefined) {
      counterSeries.set(key, { help, labels, value: amount });
      return;
    }
    sample.value += amount;
  };

  /** Sets a gauge series to `value`; undefined or non-finite values are ignored. */
  const gauge = (name: string, help: string, labels: LabelSet, value: number | undefined) => {
    if (value === undefined || !Number.isFinite(value)) {
      return;
    }
    const key = metricKey(name, labels);
    if (admitSeries(gaugeSeries, key, name)) {
      gaugeSeries.set(key, { help, labels, value });
    }
  };

  /**
   * Records one observation in a histogram series. Undefined, non-finite or
   * negative values are ignored. `buckets` is only used when the series is
   * first created.
   */
  const histogram = (
    name: string,
    help: string,
    labels: LabelSet,
    value: number | undefined,
    buckets = DURATION_BUCKETS_SECONDS,
  ) => {
    if (value === undefined || !Number.isFinite(value) || value < 0) {
      return;
    }
    const observed = value;
    const key = metricKey(name, labels);
    if (!admitSeries(histogramSeries, key, name)) {
      return;
    }
    let series = histogramSeries.get(key);
    if (series === undefined) {
      series = {
        help,
        labels,
        buckets,
        counts: new Array<number>(buckets.length).fill(0),
        count: 0,
        sum: 0,
      };
      histogramSeries.set(key, series);
    }
    const target = series;
    target.count += 1;
    target.sum += observed;
    // Bucket counts are cumulative: every bucket whose bound covers the value is bumped.
    target.buckets.forEach((upperBound, index) => {
      if (observed <= upperBound) {
        target.counts[index] = (target.counts[index] ?? 0) + 1;
      }
    });
  };

  /** Copies the series maps and appends the dropped-series counter when anything was dropped. */
  const snapshot = (): MetricSnapshot => {
    const countersCopy = new Map(counterSeries);
    if (rejectedWrites > 0) {
      countersCopy.set(metricKey(DROPPED_SERIES_COUNTER_NAME, {}), {
        help: "Prometheus metric series dropped because the exporter series cap was reached.",
        labels: {},
        value: rejectedWrites,
      });
    }
    return {
      counters: countersCopy,
      gauges: new Map(gaugeSeries),
      histograms: new Map(histogramSeries),
    };
  };

  /** Clears every series and the dropped-write count. */
  const reset = () => {
    rejectedWrites = 0;
    counterSeries.clear();
    gaugeSeries.clear();
    histogramSeries.clear();
  };

  return { counter, gauge, histogram, reset, snapshot };
}
/**
 * Produces a single-line, redacted, length-bounded description of a thrown
 * value for logging.
 *
 * @param err The caught value; may be anything.
 * @returns At most 500 characters with NUL, CR, LF, TAB and the Unicode
 *   line/paragraph separators replaced by spaces.
 */
function safeErrorMessage(err: unknown): string {
  // `Error#message` is an empty string (not null/undefined) when no message was
  // supplied, so `||` is required for the fallback to the error name to apply.
  const message = err instanceof Error ? err.message || err.name || "Error" : String(err);
  return redactSensitiveText(message)
    .replaceAll("\u0000", " ")
    .replace(/[\r\n\t\u2028\u2029]/gu, " ")
    .slice(0, 500);
}
/**
 * Renders the store's current series as Prometheus text exposition.
 *
 * Counters are written first, then gauges, then histograms, each group ordered
 * by series key. `# HELP` and `# TYPE` are written once per metric name.
 * Histograms expand to one `_bucket` line per bound, a `+Inf` bucket, `_sum`
 * and `_count`.
 *
 * @param store Store to render.
 * @returns The exposition text ending in a newline, or `""` when there are no series.
 */
function renderPrometheusMetrics(store: PrometheusMetricStore): string {
  const { counters, gauges, histograms } = store.snapshot();
  const output: string[] = [];
  const announced = new Set<string>();

  const ordered = <T>(series: Map<string, T>): [string, T][] =>
    [...series.entries()].toSorted(([a], [b]) => a.localeCompare(b));

  // Series keys are `<metric name>|<labels json>`; the name is the part before the first `|`.
  const metricNameOf = (key: string): string => key.split("|", 1)[0] ?? "";

  const announce = (name: string, type: "counter" | "gauge" | "histogram", help: string) => {
    if (!announced.has(name)) {
      announced.add(name);
      output.push(`# HELP ${name} ${escapeHelp(help)}`, `# TYPE ${name} ${type}`);
    }
  };

  const writeScalars = (
    series: Map<string, CounterSample | GaugeSample>,
    type: "counter" | "gauge",
  ) => {
    for (const [key, sample] of ordered(series)) {
      const name = metricNameOf(key);
      announce(name, type, sample.help);
      output.push(`${name}${formatLabels(sample.labels)} ${formatPrometheusNumber(sample.value)}`);
    }
  };

  writeScalars(counters, "counter");
  writeScalars(gauges, "gauge");

  for (const [key, sample] of ordered(histograms)) {
    const name = metricNameOf(key);
    const plainLabels = formatLabels(sample.labels);
    announce(name, "histogram", sample.help);
    sample.buckets.forEach((upperBound, index) => {
      const bucketLabels = formatLabels({ ...sample.labels, le: String(upperBound) });
      const bucketCount = formatPrometheusNumber(sample.counts[index] ?? 0);
      output.push(`${name}_bucket${bucketLabels} ${bucketCount}`);
    });
    const infLabels = formatLabels({ ...sample.labels, le: "+Inf" });
    output.push(`${name}_bucket${infLabels} ${formatPrometheusNumber(sample.count)}`);
    output.push(`${name}_sum${plainLabels} ${formatPrometheusNumber(sample.sum)}`);
    output.push(`${name}_count${plainLabels} ${formatPrometheusNumber(sample.count)}`);
  }

  // The trailing empty entry yields the final newline (and "" when nothing was written).
  output.push("");
  return output.join("\n");
}
/**
 * Builds the label set for agent-run metrics; every field falls back to
 * `unknown` when missing or rejected by the label policy.
 */
function runLabels(evt: {
  channel?: string;
  model?: string;
  outcome?: string;
  provider?: string;
  trigger?: string;
}): LabelSet {
  const { channel, model, outcome, provider, trigger } = evt;
  return {
    channel: lowCardinalityLabel(channel),
    model: lowCardinalityLabel(model),
    outcome: lowCardinalityLabel(outcome),
    provider: lowCardinalityLabel(provider),
    trigger: lowCardinalityLabel(trigger),
  };
}
/**
 * Builds the label set for model-call metrics.
 *
 * `outcome` is `error` for `model.call.error` events and `completed`
 * otherwise; `error_category` is only taken from the event on errors
 * (fallback `other`) and is `none` for every other event type.
 */
function modelCallLabels(evt: {
  api?: string;
  errorCategory?: string;
  model?: string;
  provider?: string;
  transport?: string;
  type: string;
}): LabelSet {
  const failed = evt.type === "model.call.error";
  const errorCategory = failed ? lowCardinalityLabel(evt.errorCategory, "other") : "none";
  return {
    api: lowCardinalityLabel(evt.api),
    error_category: errorCategory,
    model: lowCardinalityLabel(evt.model),
    outcome: failed ? "error" : "completed",
    provider: lowCardinalityLabel(evt.provider),
    transport: lowCardinalityLabel(evt.transport),
  };
}
/**
 * Builds the label set for tool-execution metrics.
 *
 * `outcome` is `error` for `tool.execution.error` events and `completed`
 * otherwise. A tool name that is missing or rejected by the label policy is
 * reported as `tool`.
 */
function toolExecutionLabels(evt: {
  errorCategory?: string;
  paramsSummary?: { kind: string };
  toolName: string;
  type: string;
}): LabelSet {
  const failed = evt.type === "tool.execution.error";
  const errorCategory = failed ? lowCardinalityLabel(evt.errorCategory, "other") : "none";
  return {
    error_category: errorCategory,
    outcome: failed ? "error" : "completed",
    params_kind: lowCardinalityLabel(evt.paramsSummary?.kind),
    tool: lowCardinalityLabel(evt.toolName, "tool"),
  };
}
/**
 * Builds the label set for harness-run metrics.
 *
 * For `harness.run.error` events `outcome` is forced to `error` and
 * `error_category`/`phase` come from the event. For all other event types
 * `outcome` comes from the event and `error_category`/`phase` are `none`.
 */
function harnessLabels(evt: {
  channel?: string;
  errorCategory?: string;
  harnessId: string;
  model?: string;
  outcome?: string;
  phase?: string;
  pluginId?: string;
  provider?: string;
  type: string;
}): LabelSet {
  const failed = evt.type === "harness.run.error";
  return {
    channel: lowCardinalityLabel(evt.channel),
    error_category: failed ? lowCardinalityLabel(evt.errorCategory, "other") : "none",
    harness: lowCardinalityLabel(evt.harnessId),
    model: lowCardinalityLabel(evt.model),
    outcome: failed ? "error" : lowCardinalityLabel(evt.outcome),
    phase: failed ? lowCardinalityLabel(evt.phase) : "none",
    plugin: lowCardinalityLabel(evt.pluginId),
    provider: lowCardinalityLabel(evt.provider),
  };
}
/**
 * Records the metrics derived from a `model.usage` event.
 *
 * Each non-zero token figure is added to `openclaw_model_tokens_total` under
 * its `token_type`. Input and output figures are additionally observed in the
 * `openclaw_gen_ai_client_token_usage` histogram. The reported cost and event
 * duration are recorded last.
 *
 * @param store Destination metric store.
 * @param evt The usage event.
 */
function recordModelUsage(
  store: PrometheusMetricStore,
  evt: Extract<DiagnosticEventPayload, { type: "model.usage" }>,
) {
  const baseLabels = {
    agent: lowCardinalityLabel(evt.agentId),
    channel: lowCardinalityLabel(evt.channel),
    model: lowCardinalityLabel(evt.model),
    provider: lowCardinalityLabel(evt.provider),
  };
  const { usage } = evt;
  // Recording order matters once the series cap is reached, so keep it fixed.
  const reportedTokens: [string, number | undefined][] = [
    ["input", usage.input],
    ["output", usage.output],
    ["cache_read", usage.cacheRead],
    ["cache_write", usage.cacheWrite],
    ["prompt", usage.promptTokens],
    ["total", usage.total],
  ];
  for (const [tokenType, reported] of reportedTokens) {
    const amount = numericValue(reported);
    if (amount === undefined || amount === 0) {
      continue;
    }
    store.counter(
      "openclaw_model_tokens_total",
      "Model tokens reported by diagnostic usage events.",
      { ...baseLabels, token_type: tokenType },
      amount,
    );
    if (tokenType !== "input" && tokenType !== "output") {
      continue;
    }
    store.histogram(
      "openclaw_gen_ai_client_token_usage",
      "GenAI token usage distribution for input and output tokens.",
      { model: baseLabels.model, provider: baseLabels.provider, token_type: tokenType },
      amount,
      TOKEN_BUCKETS,
    );
  }
  // A missing or invalid cost becomes 0, which the counter writer ignores.
  const costUsd = numericValue(evt.costUsd) ?? 0;
  store.counter(
    "openclaw_model_cost_usd_total",
    "Estimated model cost in USD reported by diagnostic usage events.",
    baseLabels,
    costUsd,
  );
  store.histogram(
    "openclaw_model_usage_duration_seconds",
    "Model usage event duration in seconds.",
    baseLabels,
    seconds(evt.durationMs),
  );
}
/**
 * Translates one diagnostic event into counter, gauge and histogram updates.
 *
 * Events whose metadata is not marked `trusted` are ignored, as are event
 * types without a case below. Each event's label set is built once and shared
 * by the counter and histogram it feeds, and every event-supplied label value
 * goes through `lowCardinalityLabel`, so no raw event text reaches the scrape
 * output.
 *
 * @param store Destination metric store.
 * @param evt The diagnostic event.
 * @param metadata Delivery metadata; only `trusted` is consulted.
 */
function recordDiagnosticEvent(
  store: PrometheusMetricStore,
  evt: DiagnosticEventPayload,
  metadata: DiagnosticEventMetadata,
): void {
  if (!metadata.trusted) {
    return;
  }
  switch (evt.type) {
    case "model.usage":
      recordModelUsage(store, evt);
      return;
    case "run.completed": {
      const labels = runLabels(evt);
      store.histogram(
        "openclaw_run_duration_seconds",
        "Agent run duration in seconds.",
        labels,
        seconds(evt.durationMs),
      );
      store.counter("openclaw_run_completed_total", "Agent runs completed by outcome.", labels);
      return;
    }
    case "model.call.completed":
    case "model.call.error": {
      const labels = modelCallLabels(evt);
      store.histogram(
        "openclaw_model_call_duration_seconds",
        "Provider model call duration in seconds.",
        labels,
        seconds(evt.durationMs),
      );
      store.counter(
        "openclaw_model_call_total",
        "Provider model calls completed by outcome.",
        labels,
      );
      return;
    }
    case "tool.execution.completed":
    case "tool.execution.error": {
      const labels = toolExecutionLabels(evt);
      store.histogram(
        "openclaw_tool_execution_duration_seconds",
        "Tool execution duration in seconds.",
        labels,
        seconds(evt.durationMs),
      );
      store.counter(
        "openclaw_tool_execution_total",
        "Tool executions completed by outcome.",
        labels,
      );
      return;
    }
    case "harness.run.completed":
    case "harness.run.error": {
      const labels = harnessLabels(evt);
      store.histogram(
        "openclaw_harness_run_duration_seconds",
        "Agent harness run duration in seconds.",
        labels,
        seconds(evt.durationMs),
      );
      store.counter(
        "openclaw_harness_run_total",
        "Agent harness runs completed by outcome.",
        labels,
      );
      return;
    }
    case "message.processed": {
      const labels: LabelSet = {
        channel: lowCardinalityLabel(evt.channel),
        outcome: lowCardinalityLabel(evt.outcome),
        reason: lowCardinalityLabel(evt.reason, "none"),
      };
      store.counter(
        "openclaw_message_processed_total",
        "Inbound messages processed by outcome.",
        labels,
      );
      store.histogram(
        "openclaw_message_processed_duration_seconds",
        "Inbound message processing duration in seconds.",
        labels,
        seconds(evt.durationMs),
      );
      return;
    }
    case "message.delivery.completed":
    case "message.delivery.error": {
      const labels: LabelSet = {
        channel: lowCardinalityLabel(evt.channel),
        delivery_kind: lowCardinalityLabel(evt.deliveryKind),
        // The inline type check narrows `evt` to the error variant for `errorCategory`.
        error_category:
          evt.type === "message.delivery.error"
            ? lowCardinalityLabel(evt.errorCategory, "other")
            : "none",
        outcome: evt.type === "message.delivery.error" ? "error" : "completed",
      };
      store.counter(
        "openclaw_message_delivery_total",
        "Outbound message delivery attempts by outcome.",
        labels,
      );
      store.histogram(
        "openclaw_message_delivery_duration_seconds",
        "Outbound message delivery duration in seconds.",
        labels,
        seconds(evt.durationMs),
      );
      return;
    }
    case "queue.lane.enqueue":
    case "queue.lane.dequeue": {
      const labels: LabelSet = { lane: lowCardinalityLabel(evt.lane) };
      store.gauge(
        "openclaw_queue_lane_size",
        "Current diagnostic queue lane size.",
        labels,
        numericValue(evt.queueSize),
      );
      // Wait time is only observed on dequeue events.
      if (evt.type === "queue.lane.dequeue") {
        store.histogram(
          "openclaw_queue_lane_wait_seconds",
          "Queue lane wait time in seconds.",
          labels,
          seconds(evt.waitMs),
        );
      }
      return;
    }
    case "session.state": {
      const state = lowCardinalityLabel(evt.state);
      store.counter("openclaw_session_state_total", "Session state observations.", {
        reason: lowCardinalityLabel(evt.reason, "none"),
        state,
      });
      if (evt.queueDepth !== undefined) {
        store.gauge(
          "openclaw_session_queue_depth",
          "Latest observed session queue depth.",
          { state },
          numericValue(evt.queueDepth),
        );
      }
      return;
    }
    case "diagnostic.memory.sample": {
      const name = "openclaw_memory_bytes";
      const help = "Latest process memory usage by memory kind.";
      store.gauge(name, help, { kind: "rss" }, evt.memory.rssBytes);
      store.gauge(name, help, { kind: "heap_total" }, evt.memory.heapTotalBytes);
      store.gauge(name, help, { kind: "heap_used" }, evt.memory.heapUsedBytes);
      store.histogram(
        "openclaw_memory_rss_bytes",
        "RSS memory sample distribution in bytes.",
        {},
        numericValue(evt.memory.rssBytes),
        BYTE_BUCKETS,
      );
      return;
    }
    case "diagnostic.memory.pressure":
      store.counter(
        "openclaw_memory_pressure_total",
        "Memory pressure events by level and reason.",
        {
          level: lowCardinalityLabel(evt.level),
          reason: lowCardinalityLabel(evt.reason),
        },
      );
      return;
    case "telemetry.exporter":
      store.counter("openclaw_telemetry_exporter_total", "Telemetry exporter lifecycle events.", {
        exporter: lowCardinalityLabel(evt.exporter),
        reason: lowCardinalityLabel(evt.reason, "none"),
        signal: lowCardinalityLabel(evt.signal),
        status: lowCardinalityLabel(evt.status),
      });
      return;
    default:
      return;
  }
}
/**
 * Creates the HTTP handler that serves the store as Prometheus text.
 *
 * GET returns the rendered metrics. HEAD returns the same status and headers
 * with no body and does not render the metrics at all. Any other method gets
 * `405` with an `Allow` header.
 *
 * @param store Metric store to render on each GET.
 * @returns A route handler that always reports the request as handled (`true`).
 */
function createMetricsHandler(store: PrometheusMetricStore): OpenClawPluginHttpRouteHandler {
  return (req: IncomingMessage, res: ServerResponse) => {
    const isHead = req.method === "HEAD";
    if (req.method !== "GET" && !isHead) {
      res.statusCode = 405;
      res.setHeader("Allow", "GET, HEAD");
      res.end("Method Not Allowed");
      return true;
    }
    // Render before touching the response so a rendering failure leaves it
    // unmodified. HEAD never sends a body, so skip the rendering work.
    const body = isHead ? undefined : renderPrometheusMetrics(store);
    res.statusCode = 200;
    res.setHeader("Cache-Control", "no-store");
    res.setHeader("Content-Type", "text/plain; version=0.0.4; charset=utf-8");
    if (body === undefined) {
      res.end();
      return true;
    }
    res.end(body);
    return true;
  };
}
/**
 * Creates the Prometheus exporter: a plugin service that feeds a private
 * metric store from internal diagnostics, plus an HTTP handler and a `render`
 * function that expose that store.
 *
 * @returns `service` (start/stop lifecycle), `handler` (scrape route handler)
 *   and `render` (current exposition text).
 */
export function createDiagnosticsPrometheusExporter() {
  const store = createPrometheusMetricStore();
  let unsubscribe: (() => void) | undefined;

  // Releases the active diagnostics subscription, if any.
  const detach = () => {
    unsubscribe?.();
    unsubscribe = undefined;
  };

  const service = {
    id: "diagnostics-prometheus",
    /**
     * Subscribes to internal diagnostics and announces the exporter. Logs an
     * error and does nothing else when the capability was not granted.
     */
    start(ctx) {
      const subscribe = ctx.internalDiagnostics?.onEvent;
      if (!subscribe) {
        ctx.logger.error("diagnostics-prometheus: internal diagnostics capability unavailable");
        return;
      }
      // A repeated start() must not leave the previous listener attached,
      // otherwise every event would be counted once per start.
      detach();
      unsubscribe = subscribe((event, metadata) => {
        try {
          recordDiagnosticEvent(store, event, metadata);
        } catch (err) {
          // Recording failures are logged and never propagate to the emitter.
          ctx.logger.error(
            `diagnostics-prometheus: event handler failed (${event.type}): ${safeErrorMessage(err)}`,
          );
        }
      });
      ctx.internalDiagnostics?.emit({
        type: "telemetry.exporter",
        exporter: "diagnostics-prometheus",
        signal: "metrics",
        status: "started",
        reason: "configured",
      });
    },
    /** Unsubscribes from diagnostics and clears all recorded series. */
    stop() {
      detach();
      store.reset();
    },
  } satisfies OpenClawPluginService;

  return {
    handler: createMetricsHandler(store),
    render: () => renderPrometheusMetrics(store),
    service,
  };
}
/** Module-private functions re-exported under one object so the test suite can call them directly. */
export const __test__ = {
createPrometheusMetricStore,
recordDiagnosticEvent,
renderPrometheusMetrics,
};

View File

@@ -0,0 +1,16 @@
{
"extends": "../tsconfig.package-boundary.base.json",
"compilerOptions": {
"rootDir": "."
},
"include": ["./*.ts", "./src/**/*.ts"],
"exclude": [
"./**/*.test.ts",
"./dist/**",
"./node_modules/**",
"./src/test-support/**",
"./src/**/*test-helpers.ts",
"./src/**/*test-harness.ts",
"./src/**/*test-support.ts"
]
}

View File

@@ -596,6 +596,10 @@
"types": "./dist/plugin-sdk/diagnostics-otel.d.ts",
"default": "./dist/plugin-sdk/diagnostics-otel.js"
},
"./plugin-sdk/diagnostics-prometheus": {
"types": "./dist/plugin-sdk/diagnostics-prometheus.d.ts",
"default": "./dist/plugin-sdk/diagnostics-prometheus.js"
},
"./plugin-sdk/diffs": {
"types": "./dist/plugin-sdk/diffs.d.ts",
"default": "./dist/plugin-sdk/diffs.js"

View File

@@ -134,6 +134,7 @@
"device-bootstrap",
"diagnostic-runtime",
"diagnostics-otel",
"diagnostics-prometheus",
"diffs",
"error-runtime",
"extension-shared",

View File

@@ -191,6 +191,7 @@ const LOCAL_EXTENSION_API_BARREL_GUARDS = [
"bluebubbles",
"device-pair",
"diagnostics-otel",
"diagnostics-prometheus",
"discord",
"diffs",
"feishu",

View File

@@ -0,0 +1,15 @@
// Narrow plugin-sdk surface for the bundled diagnostics-prometheus plugin.
// Keep this list additive and scoped to the bundled diagnostics-prometheus surface.
export type {
DiagnosticEventMetadata,
DiagnosticEventPayload,
} from "../infra/diagnostic-events.js";
export { redactSensitiveText } from "../logging/redact.js";
export { emptyPluginConfigSchema } from "../plugins/config-schema.js";
export type {
OpenClawPluginApi,
OpenClawPluginHttpRouteHandler,
OpenClawPluginService,
OpenClawPluginServiceContext,
} from "../plugins/types.js";

View File

@@ -180,7 +180,7 @@ describe("startPluginServices", () => {
expect(stopThrows).toHaveBeenCalledOnce();
});
it("grants internal diagnostics only to the bundled diagnostics OTEL service", async () => {
it("grants internal diagnostics only to bundled diagnostics exporter services", async () => {
const contexts: OpenClawPluginServiceContext[] = [];
const diagnosticsService = createTrackingService("diagnostics-otel", { contexts });
await startPluginServices({
@@ -191,6 +191,18 @@ describe("startPluginServices", () => {
expect(contexts[0]?.internalDiagnostics?.onEvent).toBeTypeOf("function");
expect(contexts[0]?.internalDiagnostics?.emit).toBeTypeOf("function");
const prometheusContexts: OpenClawPluginServiceContext[] = [];
const prometheusService = createTrackingService("diagnostics-prometheus", {
contexts: prometheusContexts,
});
await startPluginServices({
registry: createRegistry([prometheusService], "diagnostics-prometheus", "bundled"),
config: createServiceConfig(),
});
expect(prometheusContexts[0]?.internalDiagnostics?.onEvent).toBeTypeOf("function");
expect(prometheusContexts[0]?.internalDiagnostics?.emit).toBeTypeOf("function");
const untrustedContexts: OpenClawPluginServiceContext[] = [];
const untrustedService = createTrackingService("diagnostics-otel", {
contexts: untrustedContexts,

View File

@@ -24,14 +24,18 @@ function createServiceContext(params: {
workspaceDir?: string;
service?: PluginServiceRegistration;
}): OpenClawPluginServiceContext {
const grantsInternalDiagnostics =
params.service?.origin === "bundled" &&
params.service.pluginId === params.service.service.id &&
(params.service.service.id === "diagnostics-otel" ||
params.service.service.id === "diagnostics-prometheus");
return {
config: params.config,
workspaceDir: params.workspaceDir,
stateDir: STATE_DIR,
logger: createPluginLogger(),
...(params.service?.origin === "bundled" &&
params.service.pluginId === "diagnostics-otel" &&
params.service.service.id === "diagnostics-otel"
...(grantsInternalDiagnostics
? {
internalDiagnostics: {
emit: emitTrustedDiagnosticEvent,