diff --git a/.github/labeler.yml b/.github/labeler.yml index 045cb538252..f2391091284 100644 --- a/.github/labeler.yml +++ b/.github/labeler.yml @@ -233,6 +233,10 @@ - changed-files: - any-glob-to-any-file: - "extensions/diagnostics-otel/**" +"extensions: diagnostics-prometheus": + - changed-files: + - any-glob-to-any-file: + - "extensions/diagnostics-prometheus/**" "extensions: llm-task": - changed-files: - any-glob-to-any-file: diff --git a/CHANGELOG.md b/CHANGELOG.md index 5ff18fccbfa..d58bf076fa0 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -44,6 +44,7 @@ Docs: https://docs.openclaw.ai - Diagnostics/OTEL: emit bounded telemetry exporter health diagnostics for startup and log-export failures without exporting raw error text. Thanks @vincentkoc. - Diagnostics/OTEL: export agent harness lifecycle telemetry as bounded `openclaw.harness.run` spans and `openclaw.harness.duration_ms` metrics so QA-lab, Codex, and future harnesses share one trace shape. Thanks @vincentkoc. - Diagnostics/trace: propagate W3C `traceparent` headers from trusted model-call trace context to provider transports while replacing caller-supplied traceparent values. Thanks @vincentkoc. +- Diagnostics/Prometheus: add a bundled `diagnostics-prometheus` plugin with a protected gateway scrape route for low-cardinality diagnostics metrics. Thanks @vincentkoc. - Plugins/CLI: add `openclaw plugins registry` for explicit persisted-registry inspection and `--refresh` repair without making normal startup rescan plugin locations. Thanks @vincentkoc. - Plugins/CLI: make `openclaw plugins list` read the cold persisted registry snapshot by default, leaving module-aware diagnostics to `plugins doctor` and `plugins inspect`. Thanks @vincentkoc. - Plugins/startup: move gateway startup plugin planning onto the versioned cold registry index, with postinstall repair for older registry files that predate startup metadata. Thanks @vincentkoc. 
diff --git a/docs/docs.json b/docs/docs.json index 7157f88bfd2..11e5a8d93e5 100644 --- a/docs/docs.json +++ b/docs/docs.json @@ -1442,6 +1442,7 @@ "gateway/doctor", "logging", "gateway/opentelemetry", + "gateway/prometheus", "gateway/logging", "gateway/diagnostics", "gateway/troubleshooting" diff --git a/docs/gateway/prometheus.md b/docs/gateway/prometheus.md new file mode 100644 index 00000000000..7c408aa4b33 --- /dev/null +++ b/docs/gateway/prometheus.md @@ -0,0 +1,89 @@ +--- +summary: "Expose OpenClaw diagnostics as Prometheus text metrics through the diagnostics-prometheus plugin" +title: "Prometheus metrics" +read_when: + - You want Prometheus, Grafana, VictoriaMetrics, or another scraper to collect OpenClaw Gateway metrics + - You need the Prometheus metric names and label policy for dashboards or alerts + - You want metrics without running an OpenTelemetry collector +--- + +OpenClaw can expose diagnostics metrics through the bundled +`diagnostics-prometheus` plugin. It listens to trusted internal diagnostics and +renders a Prometheus text endpoint at: + +```text +/api/diagnostics/prometheus +``` + +The route uses Gateway authentication. Do not expose it as a public +unauthenticated `/metrics` endpoint. + +## Quick start + +```json5 +{ + plugins: { + allow: ["diagnostics-prometheus"], + entries: { + "diagnostics-prometheus": { enabled: true }, + }, + }, + diagnostics: { + enabled: true, + }, +} +``` + +You can also enable the plugin from the CLI: + +```bash +openclaw plugins enable diagnostics-prometheus +``` + +Then scrape the protected Gateway route with the same Gateway authentication you +use for operator APIs. 
+ +## Metrics exported + +| Metric | Type | Labels | +| --------------------------------------------- | --------- | ----------------------------------------------------------------------------------------- | +| `openclaw_run_completed_total` | counter | `channel`, `model`, `outcome`, `provider`, `trigger` | +| `openclaw_run_duration_seconds` | histogram | `channel`, `model`, `outcome`, `provider`, `trigger` | +| `openclaw_model_call_total` | counter | `api`, `error_category`, `model`, `outcome`, `provider`, `transport` | +| `openclaw_model_call_duration_seconds` | histogram | `api`, `error_category`, `model`, `outcome`, `provider`, `transport` | +| `openclaw_model_tokens_total` | counter | `agent`, `channel`, `model`, `provider`, `token_type` | +| `openclaw_gen_ai_client_token_usage` | histogram | `model`, `provider`, `token_type` | +| `openclaw_model_cost_usd_total` | counter | `agent`, `channel`, `model`, `provider` | +| `openclaw_tool_execution_total` | counter | `error_category`, `outcome`, `params_kind`, `tool` | +| `openclaw_tool_execution_duration_seconds` | histogram | `error_category`, `outcome`, `params_kind`, `tool` | +| `openclaw_harness_run_total` | counter | `channel`, `error_category`, `harness`, `model`, `outcome`, `phase`, `plugin`, `provider` | +| `openclaw_harness_run_duration_seconds` | histogram | `channel`, `error_category`, `harness`, `model`, `outcome`, `phase`, `plugin`, `provider` | +| `openclaw_message_processed_total` | counter | `channel`, `outcome`, `reason` | +| `openclaw_message_processed_duration_seconds` | histogram | `channel`, `outcome`, `reason` | +| `openclaw_message_delivery_total` | counter | `channel`, `delivery_kind`, `error_category`, `outcome` | +| `openclaw_message_delivery_duration_seconds` | histogram | `channel`, `delivery_kind`, `error_category`, `outcome` | +| `openclaw_queue_lane_size` | gauge | `lane` | +| `openclaw_queue_lane_wait_seconds` | histogram | `lane` | +| `openclaw_session_state_total` | counter | 
`reason`, `state` | +| `openclaw_session_queue_depth` | gauge | `state` | +| `openclaw_memory_bytes` | gauge | `kind` | +| `openclaw_memory_rss_bytes` | histogram | none | +| `openclaw_memory_pressure_total` | counter | `level`, `reason` | +| `openclaw_telemetry_exporter_total` | counter | `exporter`, `reason`, `signal`, `status` | +| `openclaw_prometheus_series_dropped_total` | counter | none | + +## Label policy + +Prometheus labels stay bounded and low-cardinality. The exporter does not emit +raw diagnostic identifiers such as `runId`, `sessionKey`, `sessionId`, `callId`, +`toolCallId`, message IDs, chat IDs, or provider request IDs. + +Label values are redacted and must match OpenClaw's low-cardinality character +policy. Values that fail the policy are replaced with a fixed fallback (`unknown`, +`other`, `none`, or `tool`), depending on the label. + +The exporter caps retained time series in memory. Once the cap is reached, samples +for new series are dropped and `openclaw_prometheus_series_dropped_total` increments once per dropped sample. + +For full traces, logs, OTLP export, and OpenTelemetry GenAI semantic attributes, +use [OpenTelemetry export](/gateway/opentelemetry). 
diff --git a/docs/plugins/sdk-migration.md b/docs/plugins/sdk-migration.md index 8569cf0e5c2..c822888e0a7 100644 --- a/docs/plugins/sdk-migration.md +++ b/docs/plugins/sdk-migration.md @@ -420,8 +420,9 @@ The same rule applies to other bundled-helper families such as: `plugin-sdk/nextcloud-talk`, `plugin-sdk/nostr`, `plugin-sdk/tlon`, `plugin-sdk/twitch`, `plugin-sdk/github-copilot-login`, `plugin-sdk/github-copilot-token`, - `plugin-sdk/diagnostics-otel`, `plugin-sdk/diffs`, `plugin-sdk/llm-task`, - `plugin-sdk/thread-ownership`, and `plugin-sdk/voice-call` + `plugin-sdk/diagnostics-otel`, `plugin-sdk/diagnostics-prometheus`, + `plugin-sdk/diffs`, `plugin-sdk/llm-task`, `plugin-sdk/thread-ownership`, + and `plugin-sdk/voice-call` `plugin-sdk/github-copilot-token` currently exposes the narrow token-helper surface `DEFAULT_COPILOT_API_BASE_URL`, diff --git a/docs/plugins/sdk-subpaths.md b/docs/plugins/sdk-subpaths.md index f07103fdef7..19c1256f6fe 100644 --- a/docs/plugins/sdk-subpaths.md +++ b/docs/plugins/sdk-subpaths.md @@ -271,7 +271,7 @@ For the plugin authoring guide, see [Plugin SDK overview](/plugins/sdk-overview) | Line | `plugin-sdk/line`, `plugin-sdk/line-core`, `plugin-sdk/line-runtime`, `plugin-sdk/line-surface` | Bundled LINE helper/runtime surface | | IRC | `plugin-sdk/irc`, `plugin-sdk/irc-surface` | Bundled IRC helper surface | | Channel-specific helpers | `plugin-sdk/googlechat`, `plugin-sdk/zalouser`, `plugin-sdk/bluebubbles`, `plugin-sdk/bluebubbles-policy`, `plugin-sdk/mattermost`, `plugin-sdk/mattermost-policy`, `plugin-sdk/feishu-conversation`, `plugin-sdk/msteams`, `plugin-sdk/nextcloud-talk`, `plugin-sdk/nostr`, `plugin-sdk/tlon`, `plugin-sdk/twitch` | Bundled channel compatibility/helper seams | - | Auth/plugin-specific helpers | `plugin-sdk/github-copilot-login`, `plugin-sdk/github-copilot-token`, `plugin-sdk/diagnostics-otel`, `plugin-sdk/diffs`, `plugin-sdk/llm-task`, `plugin-sdk/thread-ownership`, `plugin-sdk/voice-call` | Bundled 
feature/plugin helper seams; `plugin-sdk/github-copilot-token` currently exports `DEFAULT_COPILOT_API_BASE_URL`, `deriveCopilotApiBaseUrlFromToken`, and `resolveCopilotApiToken` | + | Auth/plugin-specific helpers | `plugin-sdk/github-copilot-login`, `plugin-sdk/github-copilot-token`, `plugin-sdk/diagnostics-otel`, `plugin-sdk/diagnostics-prometheus`, `plugin-sdk/diffs`, `plugin-sdk/llm-task`, `plugin-sdk/thread-ownership`, `plugin-sdk/voice-call` | Bundled feature/plugin helper seams; `plugin-sdk/github-copilot-token` currently exports `DEFAULT_COPILOT_API_BASE_URL`, `deriveCopilotApiBaseUrlFromToken`, and `resolveCopilotApiToken` | diff --git a/extensions/diagnostics-prometheus/api.ts b/extensions/diagnostics-prometheus/api.ts new file mode 100644 index 00000000000..079cfbecd8c --- /dev/null +++ b/extensions/diagnostics-prometheus/api.ts @@ -0,0 +1 @@ +export * from "openclaw/plugin-sdk/diagnostics-prometheus"; diff --git a/extensions/diagnostics-prometheus/index.ts b/extensions/diagnostics-prometheus/index.ts new file mode 100644 index 00000000000..70a13101747 --- /dev/null +++ b/extensions/diagnostics-prometheus/index.ts @@ -0,0 +1,20 @@ +import { definePluginEntry } from "openclaw/plugin-sdk/plugin-entry"; +import { createDiagnosticsPrometheusExporter } from "./src/service.js"; + +const exporter = createDiagnosticsPrometheusExporter(); + +export default definePluginEntry({ + id: "diagnostics-prometheus", + name: "Diagnostics Prometheus", + description: "Expose OpenClaw diagnostics metrics in Prometheus text format", + register(api) { + api.registerService(exporter.service); + api.registerHttpRoute({ + path: "/api/diagnostics/prometheus", + auth: "gateway", + match: "exact", + gatewayRuntimeScopeSurface: "trusted-operator", + handler: exporter.handler, + }); + }, +}); diff --git a/extensions/diagnostics-prometheus/openclaw.plugin.json b/extensions/diagnostics-prometheus/openclaw.plugin.json new file mode 100644 index 00000000000..8bd0f4b9e67 --- /dev/null +++ 
b/extensions/diagnostics-prometheus/openclaw.plugin.json @@ -0,0 +1,8 @@ +{ + "id": "diagnostics-prometheus", + "configSchema": { + "type": "object", + "additionalProperties": false, + "properties": {} + } +} diff --git a/extensions/diagnostics-prometheus/package.json b/extensions/diagnostics-prometheus/package.json new file mode 100644 index 00000000000..92e8bbdb840 --- /dev/null +++ b/extensions/diagnostics-prometheus/package.json @@ -0,0 +1,24 @@ +{ + "name": "@openclaw/diagnostics-prometheus", + "version": "2026.4.25", + "description": "OpenClaw diagnostics Prometheus exporter", + "type": "module", + "devDependencies": { + "@openclaw/plugin-sdk": "workspace:*" + }, + "openclaw": { + "extensions": [ + "./index.ts" + ], + "compat": { + "pluginApi": ">=2026.4.25" + }, + "build": { + "openclawVersion": "2026.4.25" + }, + "release": { + "publishToClawHub": true, + "publishToNpm": true + } + } +} diff --git a/extensions/diagnostics-prometheus/src/service.test.ts b/extensions/diagnostics-prometheus/src/service.test.ts new file mode 100644 index 00000000000..f3bfba0f4c6 --- /dev/null +++ b/extensions/diagnostics-prometheus/src/service.test.ts @@ -0,0 +1,169 @@ +import { describe, expect, it, vi } from "vitest"; +import type { DiagnosticEventMetadata, DiagnosticEventPayload } from "../api.js"; +import { createDiagnosticsPrometheusExporter, __test__ } from "./service.js"; + +const trusted: DiagnosticEventMetadata = Object.freeze({ trusted: true }); +const untrusted: DiagnosticEventMetadata = Object.freeze({ trusted: false }); + +function baseEvent(): Pick { + return { seq: 1, ts: 1700000000000 }; +} + +describe("diagnostics-prometheus service", () => { + it("records trusted run metrics without raw diagnostic identifiers", () => { + const store = __test__.createPrometheusMetricStore(); + + __test__.recordDiagnosticEvent( + store, + { + ...baseEvent(), + type: "run.completed", + runId: "run-should-not-export", + sessionKey: "session-should-not-export", + provider: 
"openai", + model: "gpt-5.4", + channel: "discord", + trigger: "message", + durationMs: 1500, + outcome: "completed", + }, + trusted, + ); + + const rendered = __test__.renderPrometheusMetrics(store); + + expect(rendered).toContain("# TYPE openclaw_run_completed_total counter"); + expect(rendered).toContain( + 'openclaw_run_completed_total{channel="discord",model="gpt-5.4",outcome="completed",provider="openai",trigger="message"} 1', + ); + expect(rendered).toContain( + 'openclaw_run_duration_seconds_sum{channel="discord",model="gpt-5.4",outcome="completed",provider="openai",trigger="message"} 1.5', + ); + expect(rendered).not.toContain("run-should-not-export"); + expect(rendered).not.toContain("session-should-not-export"); + }); + + it("drops untrusted plugin-emitted diagnostic events", () => { + const store = __test__.createPrometheusMetricStore(); + + __test__.recordDiagnosticEvent( + store, + { + ...baseEvent(), + type: "model.call.completed", + runId: "run-1", + callId: "call-1", + provider: "openai", + model: "gpt-5.4", + durationMs: 10, + }, + untrusted, + ); + + expect(__test__.renderPrometheusMetrics(store)).toBe(""); + }); + + it("redacts and bounds label values", () => { + const store = __test__.createPrometheusMetricStore(); + + __test__.recordDiagnosticEvent( + store, + { + ...baseEvent(), + type: "tool.execution.error", + toolName: "shell\nbad", + durationMs: 25, + errorCategory: "Bearer sk-secret-token-value", + }, + trusted, + ); + + const rendered = __test__.renderPrometheusMetrics(store); + + expect(rendered).toContain( + 'openclaw_tool_execution_total{error_category="other",outcome="error",params_kind="unknown",tool="tool"} 1', + ); + expect(rendered).not.toContain("Bearer"); + expect(rendered).not.toContain("sk-secret"); + }); + + it("caps metric series growth and reports dropped series", () => { + const store = __test__.createPrometheusMetricStore(); + + for (let index = 0; index < 2100; index += 1) { + __test__.recordDiagnosticEvent( + store, + 
{ + ...baseEvent(), + type: "model.call.completed", + runId: `run-${index}`, + callId: `call-${index}`, + provider: "openai", + model: `model.${index}`, + durationMs: 10, + }, + trusted, + ); + } + + const rendered = __test__.renderPrometheusMetrics(store); + + expect(rendered).toContain("# TYPE openclaw_prometheus_series_dropped_total counter"); + expect(rendered).toContain("openclaw_prometheus_series_dropped_total "); + }); + + it("subscribes to internal diagnostics and renders scrape text", () => { + const listeners: Array< + (event: DiagnosticEventPayload, metadata: DiagnosticEventMetadata) => void + > = []; + const emitted: unknown[] = []; + const exporter = createDiagnosticsPrometheusExporter(); + const unsubscribe = vi.fn(); + + exporter.service.start({ + config: {} as never, + stateDir: "/tmp/openclaw-prometheus-test", + logger: { + info: vi.fn(), + warn: vi.fn(), + error: vi.fn(), + debug: vi.fn(), + }, + internalDiagnostics: { + emit: (event) => emitted.push(event), + onEvent: (listener) => { + listeners.push(listener); + return unsubscribe; + }, + }, + }); + + listeners[0]?.( + { + ...baseEvent(), + type: "model.usage", + provider: "openai", + model: "gpt-5.4", + usage: { input: 12, output: 3, total: 15 }, + }, + trusted, + ); + + expect(emitted).toContainEqual( + expect.objectContaining({ + type: "telemetry.exporter", + exporter: "diagnostics-prometheus", + signal: "metrics", + status: "started", + }), + ); + expect(exporter.render()).toContain( + 'openclaw_model_tokens_total{agent="unknown",channel="unknown",model="gpt-5.4",provider="openai",token_type="input"} 12', + ); + + exporter.service.stop?.(); + + expect(unsubscribe).toHaveBeenCalledOnce(); + expect(exporter.render()).toBe(""); + }); +}); diff --git a/extensions/diagnostics-prometheus/src/service.ts b/extensions/diagnostics-prometheus/src/service.ts new file mode 100644 index 00000000000..3b2010f1bb0 --- /dev/null +++ b/extensions/diagnostics-prometheus/src/service.ts @@ -0,0 +1,684 @@ +import 
type { IncomingMessage, ServerResponse } from "node:http"; +import type { + DiagnosticEventMetadata, + DiagnosticEventPayload, + OpenClawPluginHttpRouteHandler, + OpenClawPluginService, +} from "../api.js"; +import { redactSensitiveText } from "../api.js"; + +type LabelSet = Record; + +type CounterSample = { + help: string; + labels: LabelSet; + value: number; +}; + +type HistogramSample = { + buckets: number[]; + counts: number[]; + count: number; + help: string; + labels: LabelSet; + sum: number; +}; + +type GaugeSample = { + help: string; + labels: LabelSet; + value: number; +}; + +type MetricSnapshot = { + counters: Map; + gauges: Map; + histograms: Map; +}; + +type PrometheusMetricStore = ReturnType; + +const DURATION_BUCKETS_SECONDS = [ + 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10, 30, 60, 120, 300, 600, +]; +const TOKEN_BUCKETS = [1, 4, 16, 64, 256, 1024, 4096, 16384, 65536, 262144, 1048576]; +const BYTE_BUCKETS = [ + 1024, 4096, 16384, 65536, 262144, 1048576, 4194304, 16777216, 67108864, 268435456, 1073741824, + 4294967296, 17179869184, +]; +const LOW_CARDINALITY_VALUE_RE = /^[A-Za-z0-9_.:-]{1,120}$/u; +const MAX_PROMETHEUS_SERIES = 2048; +const DROPPED_SERIES_COUNTER_NAME = "openclaw_prometheus_series_dropped_total"; + +function lowCardinalityLabel(value: string | undefined, fallback = "unknown"): string { + if (!value) { + return fallback; + } + const redacted = redactSensitiveText(value.trim()); + return LOW_CARDINALITY_VALUE_RE.test(redacted) ? redacted : fallback; +} + +function numericValue(value: number | undefined): number | undefined { + return typeof value === "number" && Number.isFinite(value) && value >= 0 ? value : undefined; +} + +function seconds(ms: number | undefined): number | undefined { + const value = numericValue(ms); + return value === undefined ? 
undefined : value / 1000; +} + +function sortedLabels(labels: LabelSet): [string, string][] { + return Object.entries(labels).toSorted(([left], [right]) => left.localeCompare(right)); +} + +function metricKey(name: string, labels: LabelSet): string { + return `${name}|${JSON.stringify(sortedLabels(labels))}`; +} + +function escapeHelp(value: string): string { + return value.replace(/\\/g, "\\\\").replace(/\n/g, "\\n"); +} + +function escapeLabelValue(value: string): string { + return value.replace(/\\/g, "\\\\").replace(/\n/g, "\\n").replace(/"/g, '\\"'); +} + +function formatLabels(labels: LabelSet): string { + const entries = sortedLabels(labels); + if (entries.length === 0) { + return ""; + } + return `{${entries.map(([key, value]) => `${key}="${escapeLabelValue(value)}"`).join(",")}}`; +} + +function formatPrometheusNumber(value: number): string { + if (!Number.isFinite(value)) { + return "0"; + } + return Number.isInteger(value) ? String(value) : String(Number(value.toPrecision(12))); +} + +function createPrometheusMetricStore() { + const counters = new Map(); + const gauges = new Map(); + const histograms = new Map(); + let droppedSeries = 0; + + const canCreateSeries = (map: Map, key: string, metricName: string): boolean => { + if (map.has(key)) { + return true; + } + if (metricName === DROPPED_SERIES_COUNTER_NAME) { + return true; + } + if (counters.size + gauges.size + histograms.size < MAX_PROMETHEUS_SERIES) { + return true; + } + droppedSeries += 1; + return false; + }; + + const counter = (name: string, help: string, labels: LabelSet, amount = 1) => { + if (!Number.isFinite(amount) || amount <= 0) { + return; + } + const key = metricKey(name, labels); + if (!canCreateSeries(counters, key, name)) { + return; + } + const existing = counters.get(key); + if (existing) { + existing.value += amount; + return; + } + counters.set(key, { help, labels, value: amount }); + }; + + const gauge = (name: string, help: string, labels: LabelSet, value: number | 
undefined) => { + if (value === undefined || !Number.isFinite(value)) { + return; + } + const key = metricKey(name, labels); + if (!canCreateSeries(gauges, key, name)) { + return; + } + gauges.set(key, { help, labels, value }); + }; + + const histogram = ( + name: string, + help: string, + labels: LabelSet, + value: number | undefined, + buckets = DURATION_BUCKETS_SECONDS, + ) => { + if (value === undefined || !Number.isFinite(value) || value < 0) { + return; + } + const key = metricKey(name, labels); + if (!canCreateSeries(histograms, key, name)) { + return; + } + let sample = histograms.get(key); + if (!sample) { + sample = { + buckets, + counts: buckets.map(() => 0), + count: 0, + help, + labels, + sum: 0, + }; + histograms.set(key, sample); + } + sample.count += 1; + sample.sum += value; + for (let index = 0; index < sample.buckets.length; index += 1) { + const bucket = sample.buckets[index]; + if (bucket !== undefined && value <= bucket) { + sample.counts[index] = (sample.counts[index] ?? 0) + 1; + } + } + }; + + const snapshot = (): MetricSnapshot => { + const counterSnapshot = new Map(counters); + if (droppedSeries > 0) { + counterSnapshot.set(metricKey(DROPPED_SERIES_COUNTER_NAME, {}), { + help: "Prometheus metric series dropped because the exporter series cap was reached.", + labels: {}, + value: droppedSeries, + }); + } + return { + counters: counterSnapshot, + gauges: new Map(gauges), + histograms: new Map(histograms), + }; + }; + + const reset = () => { + counters.clear(); + gauges.clear(); + histograms.clear(); + droppedSeries = 0; + }; + + return { counter, gauge, histogram, reset, snapshot }; +} + +function safeErrorMessage(err: unknown): string { + const message = err instanceof Error ? (err.message ?? 
err.name) : String(err); + return redactSensitiveText(message) + .replaceAll("\u0000", " ") + .replace(/[\r\n\t\u2028\u2029]/gu, " ") + .slice(0, 500); +} + +function renderPrometheusMetrics(store: PrometheusMetricStore): string { + const snapshot = store.snapshot(); + const lines: string[] = []; + const emitted = new Set(); + + const emitHeader = (name: string, type: "counter" | "gauge" | "histogram", help: string) => { + if (emitted.has(name)) { + return; + } + emitted.add(name); + lines.push(`# HELP ${name} ${escapeHelp(help)}`); + lines.push(`# TYPE ${name} ${type}`); + }; + + const counterEntries = [...snapshot.counters.entries()].toSorted(([left], [right]) => + left.localeCompare(right), + ); + for (const [key, sample] of counterEntries) { + const name = key.split("|", 1)[0] ?? ""; + emitHeader(name, "counter", sample.help); + lines.push(`${name}${formatLabels(sample.labels)} ${formatPrometheusNumber(sample.value)}`); + } + + const gaugeEntries = [...snapshot.gauges.entries()].toSorted(([left], [right]) => + left.localeCompare(right), + ); + for (const [key, sample] of gaugeEntries) { + const name = key.split("|", 1)[0] ?? ""; + emitHeader(name, "gauge", sample.help); + lines.push(`${name}${formatLabels(sample.labels)} ${formatPrometheusNumber(sample.value)}`); + } + + const histogramEntries = [...snapshot.histograms.entries()].toSorted(([left], [right]) => + left.localeCompare(right), + ); + for (const [key, sample] of histogramEntries) { + const name = key.split("|", 1)[0] ?? ""; + emitHeader(name, "histogram", sample.help); + for (let index = 0; index < sample.buckets.length; index += 1) { + const bucket = sample.buckets[index]; + if (bucket === undefined) { + continue; + } + lines.push( + `${name}_bucket${formatLabels({ ...sample.labels, le: String(bucket) })} ${formatPrometheusNumber(sample.counts[index] ?? 
0)}`, + ); + } + lines.push( + `${name}_bucket${formatLabels({ ...sample.labels, le: "+Inf" })} ${formatPrometheusNumber(sample.count)}`, + ); + lines.push(`${name}_sum${formatLabels(sample.labels)} ${formatPrometheusNumber(sample.sum)}`); + lines.push( + `${name}_count${formatLabels(sample.labels)} ${formatPrometheusNumber(sample.count)}`, + ); + } + + lines.push(""); + return lines.join("\n"); +} + +function runLabels(evt: { + channel?: string; + model?: string; + outcome?: string; + provider?: string; + trigger?: string; +}): LabelSet { + return { + channel: lowCardinalityLabel(evt.channel), + model: lowCardinalityLabel(evt.model), + outcome: lowCardinalityLabel(evt.outcome, "unknown"), + provider: lowCardinalityLabel(evt.provider), + trigger: lowCardinalityLabel(evt.trigger), + }; +} + +function modelCallLabels(evt: { + api?: string; + errorCategory?: string; + model?: string; + provider?: string; + transport?: string; + type: string; +}): LabelSet { + return { + api: lowCardinalityLabel(evt.api), + error_category: + evt.type === "model.call.error" ? lowCardinalityLabel(evt.errorCategory, "other") : "none", + model: lowCardinalityLabel(evt.model), + outcome: evt.type === "model.call.error" ? "error" : "completed", + provider: lowCardinalityLabel(evt.provider), + transport: lowCardinalityLabel(evt.transport), + }; +} + +function toolExecutionLabels(evt: { + errorCategory?: string; + paramsSummary?: { kind: string }; + toolName: string; + type: string; +}): LabelSet { + return { + error_category: + evt.type === "tool.execution.error" + ? lowCardinalityLabel(evt.errorCategory, "other") + : "none", + outcome: evt.type === "tool.execution.error" ? 
"error" : "completed", + params_kind: lowCardinalityLabel(evt.paramsSummary?.kind), + tool: lowCardinalityLabel(evt.toolName, "tool"), + }; +} + +function harnessLabels(evt: { + channel?: string; + errorCategory?: string; + harnessId: string; + model?: string; + outcome?: string; + phase?: string; + pluginId?: string; + provider?: string; + type: string; +}): LabelSet { + return { + channel: lowCardinalityLabel(evt.channel), + error_category: + evt.type === "harness.run.error" ? lowCardinalityLabel(evt.errorCategory, "other") : "none", + harness: lowCardinalityLabel(evt.harnessId), + model: lowCardinalityLabel(evt.model), + outcome: evt.type === "harness.run.error" ? "error" : lowCardinalityLabel(evt.outcome), + phase: evt.type === "harness.run.error" ? lowCardinalityLabel(evt.phase) : "none", + plugin: lowCardinalityLabel(evt.pluginId), + provider: lowCardinalityLabel(evt.provider), + }; +} + +function recordModelUsage( + store: PrometheusMetricStore, + evt: Extract, +) { + const labels = { + agent: lowCardinalityLabel(evt.agentId), + channel: lowCardinalityLabel(evt.channel), + model: lowCardinalityLabel(evt.model), + provider: lowCardinalityLabel(evt.provider), + }; + const usage = evt.usage; + const recordTokens = (tokenType: string, value: number | undefined) => { + const amount = numericValue(value); + if (amount === undefined || amount === 0) { + return; + } + store.counter( + "openclaw_model_tokens_total", + "Model tokens reported by diagnostic usage events.", + { + ...labels, + token_type: tokenType, + }, + amount, + ); + if (tokenType === "input" || tokenType === "output") { + store.histogram( + "openclaw_gen_ai_client_token_usage", + "GenAI token usage distribution for input and output tokens.", + { + model: labels.model, + provider: labels.provider, + token_type: tokenType, + }, + amount, + TOKEN_BUCKETS, + ); + } + }; + + recordTokens("input", usage.input); + recordTokens("output", usage.output); + recordTokens("cache_read", usage.cacheRead); + 
recordTokens("cache_write", usage.cacheWrite); + recordTokens("prompt", usage.promptTokens); + recordTokens("total", usage.total); + + store.counter( + "openclaw_model_cost_usd_total", + "Estimated model cost in USD reported by diagnostic usage events.", + labels, + numericValue(evt.costUsd) ?? 0, + ); + store.histogram( + "openclaw_model_usage_duration_seconds", + "Model usage event duration in seconds.", + labels, + seconds(evt.durationMs), + ); +} + +function recordDiagnosticEvent( + store: PrometheusMetricStore, + evt: DiagnosticEventPayload, + metadata: DiagnosticEventMetadata, +): void { + if (!metadata.trusted) { + return; + } + + switch (evt.type) { + case "model.usage": + recordModelUsage(store, evt); + return; + case "run.completed": + store.histogram( + "openclaw_run_duration_seconds", + "Agent run duration in seconds.", + runLabels(evt), + seconds(evt.durationMs), + ); + store.counter( + "openclaw_run_completed_total", + "Agent runs completed by outcome.", + runLabels(evt), + ); + return; + case "model.call.completed": + case "model.call.error": + store.histogram( + "openclaw_model_call_duration_seconds", + "Provider model call duration in seconds.", + modelCallLabels(evt), + seconds(evt.durationMs), + ); + store.counter( + "openclaw_model_call_total", + "Provider model calls completed by outcome.", + modelCallLabels(evt), + ); + return; + case "tool.execution.completed": + case "tool.execution.error": + store.histogram( + "openclaw_tool_execution_duration_seconds", + "Tool execution duration in seconds.", + toolExecutionLabels(evt), + seconds(evt.durationMs), + ); + store.counter( + "openclaw_tool_execution_total", + "Tool executions completed by outcome.", + toolExecutionLabels(evt), + ); + return; + case "harness.run.completed": + case "harness.run.error": + store.histogram( + "openclaw_harness_run_duration_seconds", + "Agent harness run duration in seconds.", + harnessLabels(evt), + seconds(evt.durationMs), + ); + store.counter( + 
"openclaw_harness_run_total", + "Agent harness runs completed by outcome.", + harnessLabels(evt), + ); + return; + case "message.processed": + store.counter("openclaw_message_processed_total", "Inbound messages processed by outcome.", { + channel: lowCardinalityLabel(evt.channel), + outcome: evt.outcome, + reason: lowCardinalityLabel(evt.reason, "none"), + }); + store.histogram( + "openclaw_message_processed_duration_seconds", + "Inbound message processing duration in seconds.", + { + channel: lowCardinalityLabel(evt.channel), + outcome: evt.outcome, + reason: lowCardinalityLabel(evt.reason, "none"), + }, + seconds(evt.durationMs), + ); + return; + case "message.delivery.completed": + case "message.delivery.error": + store.counter( + "openclaw_message_delivery_total", + "Outbound message delivery attempts by outcome.", + { + channel: lowCardinalityLabel(evt.channel), + delivery_kind: evt.deliveryKind, + error_category: + evt.type === "message.delivery.error" + ? lowCardinalityLabel(evt.errorCategory, "other") + : "none", + outcome: evt.type === "message.delivery.error" ? "error" : "completed", + }, + ); + store.histogram( + "openclaw_message_delivery_duration_seconds", + "Outbound message delivery duration in seconds.", + { + channel: lowCardinalityLabel(evt.channel), + delivery_kind: evt.deliveryKind, + error_category: + evt.type === "message.delivery.error" + ? lowCardinalityLabel(evt.errorCategory, "other") + : "none", + outcome: evt.type === "message.delivery.error" ? 
"error" : "completed", + }, + seconds(evt.durationMs), + ); + return; + case "queue.lane.enqueue": + case "queue.lane.dequeue": + store.gauge( + "openclaw_queue_lane_size", + "Current diagnostic queue lane size.", + { + lane: lowCardinalityLabel(evt.lane), + }, + numericValue(evt.queueSize), + ); + if (evt.type === "queue.lane.dequeue") { + store.histogram( + "openclaw_queue_lane_wait_seconds", + "Queue lane wait time in seconds.", + { lane: lowCardinalityLabel(evt.lane) }, + seconds(evt.waitMs), + ); + } + return; + case "session.state": + store.counter("openclaw_session_state_total", "Session state observations.", { + reason: lowCardinalityLabel(evt.reason, "none"), + state: evt.state, + }); + if (evt.queueDepth !== undefined) { + store.gauge( + "openclaw_session_queue_depth", + "Latest observed session queue depth.", + { + state: evt.state, + }, + numericValue(evt.queueDepth), + ); + } + return; + case "diagnostic.memory.sample": + store.gauge( + "openclaw_memory_bytes", + "Latest process memory usage by memory kind.", + { kind: "rss" }, + evt.memory.rssBytes, + ); + store.gauge( + "openclaw_memory_bytes", + "Latest process memory usage by memory kind.", + { kind: "heap_total" }, + evt.memory.heapTotalBytes, + ); + store.gauge( + "openclaw_memory_bytes", + "Latest process memory usage by memory kind.", + { kind: "heap_used" }, + evt.memory.heapUsedBytes, + ); + store.histogram( + "openclaw_memory_rss_bytes", + "RSS memory sample distribution in bytes.", + {}, + numericValue(evt.memory.rssBytes), + BYTE_BUCKETS, + ); + return; + case "diagnostic.memory.pressure": + store.counter( + "openclaw_memory_pressure_total", + "Memory pressure events by level and reason.", + { + level: evt.level, + reason: evt.reason, + }, + ); + return; + case "telemetry.exporter": + store.counter("openclaw_telemetry_exporter_total", "Telemetry exporter lifecycle events.", { + exporter: lowCardinalityLabel(evt.exporter), + reason: lowCardinalityLabel(evt.reason, "none"), + signal: 
evt.signal, + status: evt.status, + }); + return; + default: + return; + } +} + +/** Builds the HTTP route handler for the scrape endpoint backed by store. Requests other than GET or HEAD are answered with 405 and an Allow header listing GET, HEAD; HEAD gets the 200 status and headers with an empty body; GET gets the text produced by renderPrometheusMetrics. Every path returns true (presumably meaning the request was handled - confirm against OpenClawPluginHttpRouteHandler). */ function createMetricsHandler(store: PrometheusMetricStore): OpenClawPluginHttpRouteHandler { + return (req: IncomingMessage, res: ServerResponse) => { + if (req.method !== "GET" && req.method !== "HEAD") { + res.statusCode = 405; + res.setHeader("Allow", "GET, HEAD"); + res.end("Method Not Allowed"); + return true; + } + + /* Rendered before the HEAD check below, so HEAD requests also pay the render cost. */ const body = renderPrometheusMetrics(store); + res.statusCode = 200; + /* no-store asks caches not to keep the scrape response. */ res.setHeader("Cache-Control", "no-store"); + /* Content type of the Prometheus text exposition format, version 0.0.4. */ res.setHeader("Content-Type", "text/plain; version=0.0.4; charset=utf-8"); + if (req.method === "HEAD") { + res.end(); + return true; + } + res.end(body); + return true; + }; +} + +/** Creates one metric store and returns three views of it: handler (the HTTP scrape route), render (the current metrics text) and service (the plugin service that feeds the store). service.start subscribes to ctx.internalDiagnostics.onEvent and passes each event to recordDiagnosticEvent; when that capability is absent it logs an error and returns without subscribing. service.stop unsubscribes and resets the store. NOTE(review): start overwrites unsubscribe without invoking a previous one, so a second start without an intervening stop would leave the earlier subscription active - confirm the host never does that. */ export function createDiagnosticsPrometheusExporter() { + const store = createPrometheusMetricStore(); + let unsubscribe: (() => void) | undefined; + + const service = { + id: "diagnostics-prometheus", + start(ctx) { + const subscribe = ctx.internalDiagnostics?.onEvent; + if (!subscribe) { + ctx.logger.error("diagnostics-prometheus: internal diagnostics capability unavailable"); + return; + } + unsubscribe = subscribe((event, metadata) => { + /* A failure while recording one event is logged and not rethrown. */ try { + recordDiagnosticEvent(store, event, metadata); + } catch (err) { + ctx.logger.error( + `diagnostics-prometheus: event handler failed (${event.type}): ${safeErrorMessage(err)}`, + ); + } + }); + /* Emitted after the subscription is in place; presumably this exporter's own listener also records it - confirm the delivery semantics of emit. */ ctx.internalDiagnostics?.emit({ + type: "telemetry.exporter", + exporter: "diagnostics-prometheus", + signal: "metrics", + status: "started", + reason: "configured", + }); + }, + stop() { + unsubscribe?.(); + unsubscribe = undefined; + /* Presumably clears all accumulated metrics - confirm against the store implementation. */ store.reset(); + }, + } satisfies OpenClawPluginService; + + return { + handler: createMetricsHandler(store), + render: () => renderPrometheusMetrics(store), + service, + }; +} + +/** Exposes module-internal functions under one object (presumably for the test suite - confirm). */ export const __test__ = { + createPrometheusMetricStore, + recordDiagnosticEvent, + renderPrometheusMetrics, +}; diff --git a/extensions/diagnostics-prometheus/tsconfig.json b/extensions/diagnostics-prometheus/tsconfig.json new 
file mode 100644 index 00000000000..b8a85a99ac3 --- /dev/null +++ b/extensions/diagnostics-prometheus/tsconfig.json @@ -0,0 +1,16 @@ +{ + "extends": "../tsconfig.package-boundary.base.json", + "compilerOptions": { + "rootDir": "." + }, + "include": ["./*.ts", "./src/**/*.ts"], + "exclude": [ + "./**/*.test.ts", + "./dist/**", + "./node_modules/**", + "./src/test-support/**", + "./src/**/*test-helpers.ts", + "./src/**/*test-harness.ts", + "./src/**/*test-support.ts" + ] +} diff --git a/package.json b/package.json index 1a83d946cfc..35d32f93e50 100644 --- a/package.json +++ b/package.json @@ -596,6 +596,10 @@ "types": "./dist/plugin-sdk/diagnostics-otel.d.ts", "default": "./dist/plugin-sdk/diagnostics-otel.js" }, + "./plugin-sdk/diagnostics-prometheus": { + "types": "./dist/plugin-sdk/diagnostics-prometheus.d.ts", + "default": "./dist/plugin-sdk/diagnostics-prometheus.js" + }, "./plugin-sdk/diffs": { "types": "./dist/plugin-sdk/diffs.d.ts", "default": "./dist/plugin-sdk/diffs.js" diff --git a/scripts/lib/plugin-sdk-entrypoints.json b/scripts/lib/plugin-sdk-entrypoints.json index a6ab71c473d..3b01bf352fa 100644 --- a/scripts/lib/plugin-sdk-entrypoints.json +++ b/scripts/lib/plugin-sdk-entrypoints.json @@ -134,6 +134,7 @@ "device-bootstrap", "diagnostic-runtime", "diagnostics-otel", + "diagnostics-prometheus", "diffs", "error-runtime", "extension-shared", diff --git a/src/channels/plugins/contracts/channel-import-guardrails.test.ts b/src/channels/plugins/contracts/channel-import-guardrails.test.ts index 07a6d8a8a5f..b13319cce88 100644 --- a/src/channels/plugins/contracts/channel-import-guardrails.test.ts +++ b/src/channels/plugins/contracts/channel-import-guardrails.test.ts @@ -191,6 +191,7 @@ const LOCAL_EXTENSION_API_BARREL_GUARDS = [ "bluebubbles", "device-pair", "diagnostics-otel", + "diagnostics-prometheus", "discord", "diffs", "feishu", diff --git a/src/plugin-sdk/diagnostics-prometheus.ts b/src/plugin-sdk/diagnostics-prometheus.ts new file mode 100644 index 
00000000000..505a3fe3483 --- /dev/null +++ b/src/plugin-sdk/diagnostics-prometheus.ts @@ -0,0 +1,15 @@ +// Narrow plugin-sdk surface for the bundled diagnostics-prometheus plugin. +// Keep this list additive and scoped to the bundled diagnostics-prometheus surface. + +export type { + DiagnosticEventMetadata, + DiagnosticEventPayload, +} from "../infra/diagnostic-events.js"; +export { redactSensitiveText } from "../logging/redact.js"; +export { emptyPluginConfigSchema } from "../plugins/config-schema.js"; +export type { + OpenClawPluginApi, + OpenClawPluginHttpRouteHandler, + OpenClawPluginService, + OpenClawPluginServiceContext, +} from "../plugins/types.js"; diff --git a/src/plugins/services.test.ts b/src/plugins/services.test.ts index 361a095a67d..c3f2c2f3d54 100644 --- a/src/plugins/services.test.ts +++ b/src/plugins/services.test.ts @@ -180,7 +180,7 @@ describe("startPluginServices", () => { expect(stopThrows).toHaveBeenCalledOnce(); }); - it("grants internal diagnostics only to the bundled diagnostics OTEL service", async () => { + it("grants internal diagnostics only to bundled diagnostics exporter services", async () => { const contexts: OpenClawPluginServiceContext[] = []; const diagnosticsService = createTrackingService("diagnostics-otel", { contexts }); await startPluginServices({ @@ -191,6 +191,18 @@ describe("startPluginServices", () => { expect(contexts[0]?.internalDiagnostics?.onEvent).toBeTypeOf("function"); expect(contexts[0]?.internalDiagnostics?.emit).toBeTypeOf("function"); + const prometheusContexts: OpenClawPluginServiceContext[] = []; + const prometheusService = createTrackingService("diagnostics-prometheus", { + contexts: prometheusContexts, + }); + await startPluginServices({ + registry: createRegistry([prometheusService], "diagnostics-prometheus", "bundled"), + config: createServiceConfig(), + }); + + expect(prometheusContexts[0]?.internalDiagnostics?.onEvent).toBeTypeOf("function"); + 
expect(prometheusContexts[0]?.internalDiagnostics?.emit).toBeTypeOf("function"); + const untrustedContexts: OpenClawPluginServiceContext[] = []; const untrustedService = createTrackingService("diagnostics-otel", { contexts: untrustedContexts, diff --git a/src/plugins/services.ts b/src/plugins/services.ts index db5dd513572..a92ed4abe9b 100644 --- a/src/plugins/services.ts +++ b/src/plugins/services.ts @@ -24,14 +24,18 @@ function createServiceContext(params: { workspaceDir?: string; service?: PluginServiceRegistration; }): OpenClawPluginServiceContext { + const grantsInternalDiagnostics = + params.service?.origin === "bundled" && + params.service.pluginId === params.service.service.id && + (params.service.service.id === "diagnostics-otel" || + params.service.service.id === "diagnostics-prometheus"); + return { config: params.config, workspaceDir: params.workspaceDir, stateDir: STATE_DIR, logger: createPluginLogger(), - ...(params.service?.origin === "bundled" && - params.service.pluginId === "diagnostics-otel" && - params.service.service.id === "diagnostics-otel" + ...(grantsInternalDiagnostics ? { internalDiagnostics: { emit: emitTrustedDiagnosticEvent,