feat(diagnostics-prometheus): add protected metrics exporter

2026-05-06 05:50:43 +00:00 · 2026-04-26 01:05:45 -07:00
parent 6cd047e7c2
commit 0f2e7510cb
19 changed files with 1062 additions and 7 deletions
--- a/.github/labeler.yml
+++ b/.github/labeler.yml
@@ -233,6 +233,10 @@
  - changed-files:
      - any-glob-to-any-file:
          - "extensions/diagnostics-otel/**"
+"extensions: diagnostics-prometheus":
+  - changed-files:
+      - any-glob-to-any-file:
+          - "extensions/diagnostics-prometheus/**"
 "extensions: llm-task":
  - changed-files:
      - any-glob-to-any-file:
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -44,6 +44,7 @@ Docs: https://docs.openclaw.ai
 - Diagnostics/OTEL: emit bounded telemetry exporter health diagnostics for startup and log-export failures without exporting raw error text. Thanks @vincentkoc.
 - Diagnostics/OTEL: export agent harness lifecycle telemetry as bounded `openclaw.harness.run` spans and `openclaw.harness.duration_ms` metrics so QA-lab, Codex, and future harnesses share one trace shape. Thanks @vincentkoc.
 - Diagnostics/trace: propagate W3C `traceparent` headers from trusted model-call trace context to provider transports while replacing caller-supplied traceparent values. Thanks @vincentkoc.
+- Diagnostics/Prometheus: add a bundled `diagnostics-prometheus` plugin with a protected gateway scrape route for low-cardinality diagnostics metrics. Thanks @vincentkoc.
 - Plugins/CLI: add `openclaw plugins registry` for explicit persisted-registry inspection and `--refresh` repair without making normal startup rescan plugin locations. Thanks @vincentkoc.
 - Plugins/CLI: make `openclaw plugins list` read the cold persisted registry snapshot by default, leaving module-aware diagnostics to `plugins doctor` and `plugins inspect`. Thanks @vincentkoc.
 - Plugins/startup: move gateway startup plugin planning onto the versioned cold registry index, with postinstall repair for older registry files that predate startup metadata. Thanks @vincentkoc.
--- a/docs/docs.json
+++ b/docs/docs.json
@@ -1442,6 +1442,7 @@
                      "gateway/doctor",
                      "logging",
                      "gateway/opentelemetry",
+                      "gateway/prometheus",
                      "gateway/logging",
                      "gateway/diagnostics",
                      "gateway/troubleshooting"
--- a/docs/gateway/prometheus.md
+++ b/docs/gateway/prometheus.md
@@ -0,0 +1,90 @@
+---
+summary: "Expose OpenClaw diagnostics as Prometheus text metrics through the diagnostics-prometheus plugin"
+title: "Prometheus metrics"
+read_when:
+  - You want Prometheus, Grafana, VictoriaMetrics, or another scraper to collect OpenClaw Gateway metrics
+  - You need the Prometheus metric names and label policy for dashboards or alerts
+  - You want metrics without running an OpenTelemetry collector
+---
+
+OpenClaw can expose diagnostics metrics through the bundled
+`diagnostics-prometheus` plugin. It listens to trusted internal diagnostics and
+renders a Prometheus text endpoint at:
+
+```text
+/api/diagnostics/prometheus
+```
+
+The route uses Gateway authentication. Do not expose it as a public
+unauthenticated `/metrics` endpoint.
+
+## Quick start
+
+```json5
+{
+  plugins: {
+    allow: ["diagnostics-prometheus"],
+    entries: {
+      "diagnostics-prometheus": { enabled: true },
+    },
+  },
+  diagnostics: {
+    enabled: true,
+  },
+}
+```
+
+You can also enable the plugin from the CLI:
+
+```bash
+openclaw plugins enable diagnostics-prometheus
+```
+
+Then scrape the protected Gateway route with the same Gateway authentication you
+use for operator APIs.
+
+## Metrics exported
+
+| Metric                                        | Type      | Labels                                                                                    |
+| --------------------------------------------- | --------- | ----------------------------------------------------------------------------------------- |
+| `openclaw_run_completed_total`                | counter   | `channel`, `model`, `outcome`, `provider`, `trigger`                                      |
+| `openclaw_run_duration_seconds`               | histogram | `channel`, `model`, `outcome`, `provider`, `trigger`                                      |
+| `openclaw_model_call_total`                   | counter   | `api`, `error_category`, `model`, `outcome`, `provider`, `transport`                      |
+| `openclaw_model_call_duration_seconds`        | histogram | `api`, `error_category`, `model`, `outcome`, `provider`, `transport`                      |
+| `openclaw_model_tokens_total`                 | counter   | `agent`, `channel`, `model`, `provider`, `token_type`                                     |
+| `openclaw_gen_ai_client_token_usage`          | histogram | `model`, `provider`, `token_type`                                                         |
+| `openclaw_model_cost_usd_total`               | counter   | `agent`, `channel`, `model`, `provider`                                                   |
+| `openclaw_model_usage_duration_seconds`       | histogram | `agent`, `channel`, `model`, `provider`                                                   |
+| `openclaw_tool_execution_total`               | counter   | `error_category`, `outcome`, `params_kind`, `tool`                                        |
+| `openclaw_tool_execution_duration_seconds`    | histogram | `error_category`, `outcome`, `params_kind`, `tool`                                        |
+| `openclaw_harness_run_total`                  | counter   | `channel`, `error_category`, `harness`, `model`, `outcome`, `phase`, `plugin`, `provider` |
+| `openclaw_harness_run_duration_seconds`       | histogram | `channel`, `error_category`, `harness`, `model`, `outcome`, `phase`, `plugin`, `provider` |
+| `openclaw_message_processed_total`            | counter   | `channel`, `outcome`, `reason`                                                            |
+| `openclaw_message_processed_duration_seconds` | histogram | `channel`, `outcome`, `reason`                                                            |
+| `openclaw_message_delivery_total`             | counter   | `channel`, `delivery_kind`, `error_category`, `outcome`                                   |
+| `openclaw_message_delivery_duration_seconds`  | histogram | `channel`, `delivery_kind`, `error_category`, `outcome`                                   |
+| `openclaw_queue_lane_size`                    | gauge     | `lane`                                                                                    |
+| `openclaw_queue_lane_wait_seconds`            | histogram | `lane`                                                                                    |
+| `openclaw_session_state_total`                | counter   | `reason`, `state`                                                                         |
+| `openclaw_session_queue_depth`                | gauge     | `state`                                                                                   |
+| `openclaw_memory_bytes`                       | gauge     | `kind`                                                                                    |
+| `openclaw_memory_rss_bytes`                   | histogram | none                                                                                      |
+| `openclaw_memory_pressure_total`              | counter   | `level`, `reason`                                                                         |
+| `openclaw_telemetry_exporter_total`           | counter   | `exporter`, `reason`, `signal`, `status`                                                  |
+| `openclaw_prometheus_series_dropped_total`    | counter   | none                                                                                      |
+
+## Label policy
+
+Prometheus labels stay bounded and low-cardinality. The exporter does not emit
+raw diagnostic identifiers such as `runId`, `sessionKey`, `sessionId`, `callId`,
+`toolCallId`, message IDs, chat IDs, or provider request IDs.
+
+Label values are redacted and must match OpenClaw's low-cardinality character
+policy. Values that fail the policy are replaced with `unknown`, `other`, or
+`none`, depending on the metric.
+
+The exporter caps retained time series in memory. At the cap, each update to a
+new series is dropped and increments `openclaw_prometheus_series_dropped_total`.
+
+For full traces, logs, OTLP export, and OpenTelemetry GenAI semantic attributes,
+use [OpenTelemetry export](/gateway/opentelemetry).
--- a/docs/plugins/sdk-migration.md
+++ b/docs/plugins/sdk-migration.md
@@ -420,8 +420,9 @@ The same rule applies to other bundled-helper families such as:
  `plugin-sdk/nextcloud-talk`, `plugin-sdk/nostr`, `plugin-sdk/tlon`,
  `plugin-sdk/twitch`,
  `plugin-sdk/github-copilot-login`, `plugin-sdk/github-copilot-token`,
-  `plugin-sdk/diagnostics-otel`, `plugin-sdk/diffs`, `plugin-sdk/llm-task`,
-  `plugin-sdk/thread-ownership`, and `plugin-sdk/voice-call`
+  `plugin-sdk/diagnostics-otel`, `plugin-sdk/diagnostics-prometheus`,
+  `plugin-sdk/diffs`, `plugin-sdk/llm-task`, `plugin-sdk/thread-ownership`,
+  and `plugin-sdk/voice-call`

 `plugin-sdk/github-copilot-token` currently exposes the narrow token-helper
 surface `DEFAULT_COPILOT_API_BASE_URL`,
--- a/docs/plugins/sdk-subpaths.md
+++ b/docs/plugins/sdk-subpaths.md
@@ -271,7 +271,7 @@ For the plugin authoring guide, see [Plugin SDK overview](/plugins/sdk-overview)
    | Line | `plugin-sdk/line`, `plugin-sdk/line-core`, `plugin-sdk/line-runtime`, `plugin-sdk/line-surface` | Bundled LINE helper/runtime surface |
    | IRC | `plugin-sdk/irc`, `plugin-sdk/irc-surface` | Bundled IRC helper surface |
    | Channel-specific helpers | `plugin-sdk/googlechat`, `plugin-sdk/zalouser`, `plugin-sdk/bluebubbles`, `plugin-sdk/bluebubbles-policy`, `plugin-sdk/mattermost`, `plugin-sdk/mattermost-policy`, `plugin-sdk/feishu-conversation`, `plugin-sdk/msteams`, `plugin-sdk/nextcloud-talk`, `plugin-sdk/nostr`, `plugin-sdk/tlon`, `plugin-sdk/twitch` | Bundled channel compatibility/helper seams |
-    | Auth/plugin-specific helpers | `plugin-sdk/github-copilot-login`, `plugin-sdk/github-copilot-token`, `plugin-sdk/diagnostics-otel`, `plugin-sdk/diffs`, `plugin-sdk/llm-task`, `plugin-sdk/thread-ownership`, `plugin-sdk/voice-call` | Bundled feature/plugin helper seams; `plugin-sdk/github-copilot-token` currently exports `DEFAULT_COPILOT_API_BASE_URL`, `deriveCopilotApiBaseUrlFromToken`, and `resolveCopilotApiToken` |
+    | Auth/plugin-specific helpers | `plugin-sdk/github-copilot-login`, `plugin-sdk/github-copilot-token`, `plugin-sdk/diagnostics-otel`, `plugin-sdk/diagnostics-prometheus`, `plugin-sdk/diffs`, `plugin-sdk/llm-task`, `plugin-sdk/thread-ownership`, `plugin-sdk/voice-call` | Bundled feature/plugin helper seams; `plugin-sdk/github-copilot-token` currently exports `DEFAULT_COPILOT_API_BASE_URL`, `deriveCopilotApiBaseUrlFromToken`, and `resolveCopilotApiToken` |
  </Accordion>
 </AccordionGroup>

--- a/extensions/diagnostics-prometheus/api.ts
+++ b/extensions/diagnostics-prometheus/api.ts
@@ -0,0 +1 @@
+export * from "openclaw/plugin-sdk/diagnostics-prometheus";
--- a/extensions/diagnostics-prometheus/index.ts
+++ b/extensions/diagnostics-prometheus/index.ts
@@ -0,0 +1,20 @@
+import { definePluginEntry } from "openclaw/plugin-sdk/plugin-entry";
+import { createDiagnosticsPrometheusExporter } from "./src/service.js";
+
+const exporter = createDiagnosticsPrometheusExporter();
+
+export default definePluginEntry({
+  id: "diagnostics-prometheus",
+  name: "Diagnostics Prometheus",
+  description: "Expose OpenClaw diagnostics metrics in Prometheus text format",
+  register(api) {
+    api.registerService(exporter.service);
+    api.registerHttpRoute({
+      path: "/api/diagnostics/prometheus",
+      auth: "gateway",
+      match: "exact",
+      gatewayRuntimeScopeSurface: "trusted-operator",
+      handler: exporter.handler,
+    });
+  },
+});
--- a/extensions/diagnostics-prometheus/openclaw.plugin.json
+++ b/extensions/diagnostics-prometheus/openclaw.plugin.json
@@ -0,0 +1,8 @@
+{
+  "id": "diagnostics-prometheus",
+  "configSchema": {
+    "type": "object",
+    "additionalProperties": false,
+    "properties": {}
+  }
+}
--- a/extensions/diagnostics-prometheus/package.json
+++ b/extensions/diagnostics-prometheus/package.json
@@ -0,0 +1,24 @@
+{
+  "name": "@openclaw/diagnostics-prometheus",
+  "version": "2026.4.25",
+  "description": "OpenClaw diagnostics Prometheus exporter",
+  "type": "module",
+  "devDependencies": {
+    "@openclaw/plugin-sdk": "workspace:*"
+  },
+  "openclaw": {
+    "extensions": [
+      "./index.ts"
+    ],
+    "compat": {
+      "pluginApi": ">=2026.4.25"
+    },
+    "build": {
+      "openclawVersion": "2026.4.25"
+    },
+    "release": {
+      "publishToClawHub": true,
+      "publishToNpm": true
+    }
+  }
+}
--- a/extensions/diagnostics-prometheus/src/service.test.ts
+++ b/extensions/diagnostics-prometheus/src/service.test.ts
@@ -0,0 +1,169 @@
+import { describe, expect, it, vi } from "vitest";
+import type { DiagnosticEventMetadata, DiagnosticEventPayload } from "../api.js";
+import { createDiagnosticsPrometheusExporter, __test__ } from "./service.js";
+
+const trusted: DiagnosticEventMetadata = Object.freeze({ trusted: true });
+const untrusted: DiagnosticEventMetadata = Object.freeze({ trusted: false });
+
+function baseEvent(): Pick<DiagnosticEventPayload, "seq" | "ts"> {
+  return { seq: 1, ts: 1700000000000 };
+}
+
+describe("diagnostics-prometheus service", () => {
+  it("records trusted run metrics without raw diagnostic identifiers", () => {
+    const store = __test__.createPrometheusMetricStore();
+
+    __test__.recordDiagnosticEvent(
+      store,
+      {
+        ...baseEvent(),
+        type: "run.completed",
+        runId: "run-should-not-export",
+        sessionKey: "session-should-not-export",
+        provider: "openai",
+        model: "gpt-5.4",
+        channel: "discord",
+        trigger: "message",
+        durationMs: 1500,
+        outcome: "completed",
+      },
+      trusted,
+    );
+
+    const rendered = __test__.renderPrometheusMetrics(store);
+
+    expect(rendered).toContain("# TYPE openclaw_run_completed_total counter");
+    expect(rendered).toContain(
+      'openclaw_run_completed_total{channel="discord",model="gpt-5.4",outcome="completed",provider="openai",trigger="message"} 1',
+    );
+    expect(rendered).toContain(
+      'openclaw_run_duration_seconds_sum{channel="discord",model="gpt-5.4",outcome="completed",provider="openai",trigger="message"} 1.5',
+    );
+    expect(rendered).not.toContain("run-should-not-export");
+    expect(rendered).not.toContain("session-should-not-export");
+  });
+
+  it("drops untrusted plugin-emitted diagnostic events", () => {
+    const store = __test__.createPrometheusMetricStore();
+
+    __test__.recordDiagnosticEvent(
+      store,
+      {
+        ...baseEvent(),
+        type: "model.call.completed",
+        runId: "run-1",
+        callId: "call-1",
+        provider: "openai",
+        model: "gpt-5.4",
+        durationMs: 10,
+      },
+      untrusted,
+    );
+
+    expect(__test__.renderPrometheusMetrics(store)).toBe("");
+  });
+
+  it("redacts and bounds label values", () => {
+    const store = __test__.createPrometheusMetricStore();
+
+    __test__.recordDiagnosticEvent(
+      store,
+      {
+        ...baseEvent(),
+        type: "tool.execution.error",
+        toolName: "shell\nbad",
+        durationMs: 25,
+        errorCategory: "Bearer sk-secret-token-value",
+      },
+      trusted,
+    );
+
+    const rendered = __test__.renderPrometheusMetrics(store);
+
+    expect(rendered).toContain(
+      'openclaw_tool_execution_total{error_category="other",outcome="error",params_kind="unknown",tool="tool"} 1',
+    );
+    expect(rendered).not.toContain("Bearer");
+    expect(rendered).not.toContain("sk-secret");
+  });
+
+  it("caps metric series growth and reports dropped series", () => {
+    const store = __test__.createPrometheusMetricStore();
+
+    for (let index = 0; index < 2100; index += 1) {
+      __test__.recordDiagnosticEvent(
+        store,
+        {
+          ...baseEvent(),
+          type: "model.call.completed",
+          runId: `run-${index}`,
+          callId: `call-${index}`,
+          provider: "openai",
+          model: `model.${index}`,
+          durationMs: 10,
+        },
+        trusted,
+      );
+    }
+
+    const rendered = __test__.renderPrometheusMetrics(store);
+
+    expect(rendered).toContain("# TYPE openclaw_prometheus_series_dropped_total counter");
+    expect(rendered).toContain("openclaw_prometheus_series_dropped_total ");
+  });
+
+  it("subscribes to internal diagnostics and renders scrape text", () => {
+    const listeners: Array<
+      (event: DiagnosticEventPayload, metadata: DiagnosticEventMetadata) => void
+    > = [];
+    const emitted: unknown[] = [];
+    const exporter = createDiagnosticsPrometheusExporter();
+    const unsubscribe = vi.fn();
+
+    exporter.service.start({
+      config: {} as never,
+      stateDir: "/tmp/openclaw-prometheus-test",
+      logger: {
+        info: vi.fn(),
+        warn: vi.fn(),
+        error: vi.fn(),
+        debug: vi.fn(),
+      },
+      internalDiagnostics: {
+        emit: (event) => emitted.push(event),
+        onEvent: (listener) => {
+          listeners.push(listener);
+          return unsubscribe;
+        },
+      },
+    });
+
+    listeners[0]?.(
+      {
+        ...baseEvent(),
+        type: "model.usage",
+        provider: "openai",
+        model: "gpt-5.4",
+        usage: { input: 12, output: 3, total: 15 },
+      },
+      trusted,
+    );
+
+    expect(emitted).toContainEqual(
+      expect.objectContaining({
+        type: "telemetry.exporter",
+        exporter: "diagnostics-prometheus",
+        signal: "metrics",
+        status: "started",
+      }),
+    );
+    expect(exporter.render()).toContain(
+      'openclaw_model_tokens_total{agent="unknown",channel="unknown",model="gpt-5.4",provider="openai",token_type="input"} 12',
+    );
+
+    exporter.service.stop?.();
+
+    expect(unsubscribe).toHaveBeenCalledOnce();
+    expect(exporter.render()).toBe("");
+  });
+});
--- a/extensions/diagnostics-prometheus/src/service.ts
+++ b/extensions/diagnostics-prometheus/src/service.ts
@@ -0,0 +1,684 @@
+import type { IncomingMessage, ServerResponse } from "node:http";
+import type {
+  DiagnosticEventMetadata,
+  DiagnosticEventPayload,
+  OpenClawPluginHttpRouteHandler,
+  OpenClawPluginService,
+} from "../api.js";
+import { redactSensitiveText } from "../api.js";
+
+type LabelSet = Record<string, string>;
+
+type CounterSample = {
+  help: string;
+  labels: LabelSet;
+  value: number;
+};
+
+type HistogramSample = {
+  buckets: number[];
+  counts: number[];
+  count: number;
+  help: string;
+  labels: LabelSet;
+  sum: number;
+};
+
+type GaugeSample = {
+  help: string;
+  labels: LabelSet;
+  value: number;
+};
+
+type MetricSnapshot = {
+  counters: Map<string, CounterSample>;
+  gauges: Map<string, GaugeSample>;
+  histograms: Map<string, HistogramSample>;
+};
+
+type PrometheusMetricStore = ReturnType<typeof createPrometheusMetricStore>;
+
+const DURATION_BUCKETS_SECONDS = [
+  0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10, 30, 60, 120, 300, 600,
+];
+const TOKEN_BUCKETS = [1, 4, 16, 64, 256, 1024, 4096, 16384, 65536, 262144, 1048576];
+const BYTE_BUCKETS = [
+  1024, 4096, 16384, 65536, 262144, 1048576, 4194304, 16777216, 67108864, 268435456, 1073741824,
+  4294967296, 17179869184,
+];
+const LOW_CARDINALITY_VALUE_RE = /^[A-Za-z0-9_.:-]{1,120}$/u;
+const MAX_PROMETHEUS_SERIES = 2048;
+const DROPPED_SERIES_COUNTER_NAME = "openclaw_prometheus_series_dropped_total";
+
+function lowCardinalityLabel(value: string | undefined, fallback = "unknown"): string {
+  if (!value) {
+    return fallback;
+  }
+  const redacted = redactSensitiveText(value.trim());
+  return LOW_CARDINALITY_VALUE_RE.test(redacted) ? redacted : fallback;
+}
+
+function numericValue(value: number | undefined): number | undefined {
+  return typeof value === "number" && Number.isFinite(value) && value >= 0 ? value : undefined;
+}
+
+function seconds(ms: number | undefined): number | undefined {
+  const value = numericValue(ms);
+  return value === undefined ? undefined : value / 1000;
+}
+
+function sortedLabels(labels: LabelSet): [string, string][] {
+  return Object.entries(labels).toSorted(([left], [right]) => left.localeCompare(right));
+}
+
+function metricKey(name: string, labels: LabelSet): string {
+  return `${name}|${JSON.stringify(sortedLabels(labels))}`;
+}
+
+function escapeHelp(value: string): string {
+  return value.replace(/\\/g, "\\\\").replace(/\n/g, "\\n");
+}
+
+function escapeLabelValue(value: string): string {
+  return value.replace(/\\/g, "\\\\").replace(/\n/g, "\\n").replace(/"/g, '\\"');
+}
+
+function formatLabels(labels: LabelSet): string {
+  const entries = sortedLabels(labels);
+  if (entries.length === 0) {
+    return "";
+  }
+  return `{${entries.map(([key, value]) => `${key}="${escapeLabelValue(value)}"`).join(",")}}`;
+}
+
+function formatPrometheusNumber(value: number): string {
+  if (!Number.isFinite(value)) {
+    return "0";
+  }
+  return Number.isInteger(value) ? String(value) : String(Number(value.toPrecision(12)));
+}
+
+function createPrometheusMetricStore() {
+  const counters = new Map<string, CounterSample>();
+  const gauges = new Map<string, GaugeSample>();
+  const histograms = new Map<string, HistogramSample>();
+  let droppedSeries = 0;
+
+  const canCreateSeries = <T>(map: Map<string, T>, key: string, metricName: string): boolean => {
+    if (map.has(key)) {
+      return true;
+    }
+    if (metricName === DROPPED_SERIES_COUNTER_NAME) {
+      return true;
+    }
+    if (counters.size + gauges.size + histograms.size < MAX_PROMETHEUS_SERIES) {
+      return true;
+    }
+    droppedSeries += 1;
+    return false;
+  };
+
+  const counter = (name: string, help: string, labels: LabelSet, amount = 1) => {
+    if (!Number.isFinite(amount) || amount <= 0) {
+      return;
+    }
+    const key = metricKey(name, labels);
+    if (!canCreateSeries(counters, key, name)) {
+      return;
+    }
+    const existing = counters.get(key);
+    if (existing) {
+      existing.value += amount;
+      return;
+    }
+    counters.set(key, { help, labels, value: amount });
+  };
+
+  const gauge = (name: string, help: string, labels: LabelSet, value: number | undefined) => {
+    if (value === undefined || !Number.isFinite(value)) {
+      return;
+    }
+    const key = metricKey(name, labels);
+    if (!canCreateSeries(gauges, key, name)) {
+      return;
+    }
+    gauges.set(key, { help, labels, value });
+  };
+
+  const histogram = (
+    name: string,
+    help: string,
+    labels: LabelSet,
+    value: number | undefined,
+    buckets = DURATION_BUCKETS_SECONDS,
+  ) => {
+    if (value === undefined || !Number.isFinite(value) || value < 0) {
+      return;
+    }
+    const key = metricKey(name, labels);
+    if (!canCreateSeries(histograms, key, name)) {
+      return;
+    }
+    let sample = histograms.get(key);
+    if (!sample) {
+      sample = {
+        buckets,
+        counts: buckets.map(() => 0),
+        count: 0,
+        help,
+        labels,
+        sum: 0,
+      };
+      histograms.set(key, sample);
+    }
+    sample.count += 1;
+    sample.sum += value;
+    for (let index = 0; index < sample.buckets.length; index += 1) {
+      const bucket = sample.buckets[index];
+      if (bucket !== undefined && value <= bucket) {
+        sample.counts[index] = (sample.counts[index] ?? 0) + 1;
+      }
+    }
+  };
+
+  const snapshot = (): MetricSnapshot => {
+    const counterSnapshot = new Map(counters);
+    if (droppedSeries > 0) {
+      counterSnapshot.set(metricKey(DROPPED_SERIES_COUNTER_NAME, {}), {
+        help: "Prometheus metric series dropped because the exporter series cap was reached.",
+        labels: {},
+        value: droppedSeries,
+      });
+    }
+    return {
+      counters: counterSnapshot,
+      gauges: new Map(gauges),
+      histograms: new Map(histograms),
+    };
+  };
+
+  const reset = () => {
+    counters.clear();
+    gauges.clear();
+    histograms.clear();
+    droppedSeries = 0;
+  };
+
+  return { counter, gauge, histogram, reset, snapshot };
+}
+
+/** Returns redacted, single-line error text capped at 500 characters. */
+function safeErrorMessage(err: unknown): string {
+  // `||`, not `??`: an unset Error.message is "" (never nullish), so fall back to the name.
+  const message = err instanceof Error ? err.message || err.name : String(err);
+  const flattened = redactSensitiveText(message).replaceAll("\u0000", " ");
+  return flattened.replace(/[\r\n\t\u2028\u2029]/gu, " ").slice(0, 500);
+}
+
+function renderPrometheusMetrics(store: PrometheusMetricStore): string {
+  const snapshot = store.snapshot();
+  const lines: string[] = [];
+  const emitted = new Set<string>();
+
+  const emitHeader = (name: string, type: "counter" | "gauge" | "histogram", help: string) => {
+    if (emitted.has(name)) {
+      return;
+    }
+    emitted.add(name);
+    lines.push(`# HELP ${name} ${escapeHelp(help)}`);
+    lines.push(`# TYPE ${name} ${type}`);
+  };
+
+  const counterEntries = [...snapshot.counters.entries()].toSorted(([left], [right]) =>
+    left.localeCompare(right),
+  );
+  for (const [key, sample] of counterEntries) {
+    const name = key.split("|", 1)[0] ?? "";
+    emitHeader(name, "counter", sample.help);
+    lines.push(`${name}${formatLabels(sample.labels)} ${formatPrometheusNumber(sample.value)}`);
+  }
+
+  const gaugeEntries = [...snapshot.gauges.entries()].toSorted(([left], [right]) =>
+    left.localeCompare(right),
+  );
+  for (const [key, sample] of gaugeEntries) {
+    const name = key.split("|", 1)[0] ?? "";
+    emitHeader(name, "gauge", sample.help);
+    lines.push(`${name}${formatLabels(sample.labels)} ${formatPrometheusNumber(sample.value)}`);
+  }
+
+  const histogramEntries = [...snapshot.histograms.entries()].toSorted(([left], [right]) =>
+    left.localeCompare(right),
+  );
+  for (const [key, sample] of histogramEntries) {
+    const name = key.split("|", 1)[0] ?? "";
+    emitHeader(name, "histogram", sample.help);
+    for (let index = 0; index < sample.buckets.length; index += 1) {
+      const bucket = sample.buckets[index];
+      if (bucket === undefined) {
+        continue;
+      }
+      lines.push(
+        `${name}_bucket${formatLabels({ ...sample.labels, le: String(bucket) })} ${formatPrometheusNumber(sample.counts[index] ?? 0)}`,
+      );
+    }
+    lines.push(
+      `${name}_bucket${formatLabels({ ...sample.labels, le: "+Inf" })} ${formatPrometheusNumber(sample.count)}`,
+    );
+    lines.push(`${name}_sum${formatLabels(sample.labels)} ${formatPrometheusNumber(sample.sum)}`);
+    lines.push(
+      `${name}_count${formatLabels(sample.labels)} ${formatPrometheusNumber(sample.count)}`,
+    );
+  }
+
+  lines.push("");
+  return lines.join("\n");
+}
+
+function runLabels(evt: {
+  channel?: string;
+  model?: string;
+  outcome?: string;
+  provider?: string;
+  trigger?: string;
+}): LabelSet {
+  return {
+    channel: lowCardinalityLabel(evt.channel),
+    model: lowCardinalityLabel(evt.model),
+    outcome: lowCardinalityLabel(evt.outcome, "unknown"),
+    provider: lowCardinalityLabel(evt.provider),
+    trigger: lowCardinalityLabel(evt.trigger),
+  };
+}
+
+function modelCallLabels(evt: {
+  api?: string;
+  errorCategory?: string;
+  model?: string;
+  provider?: string;
+  transport?: string;
+  type: string;
+}): LabelSet {
+  return {
+    api: lowCardinalityLabel(evt.api),
+    error_category:
+      evt.type === "model.call.error" ? lowCardinalityLabel(evt.errorCategory, "other") : "none",
+    model: lowCardinalityLabel(evt.model),
+    outcome: evt.type === "model.call.error" ? "error" : "completed",
+    provider: lowCardinalityLabel(evt.provider),
+    transport: lowCardinalityLabel(evt.transport),
+  };
+}
+
+function toolExecutionLabels(evt: {
+  errorCategory?: string;
+  paramsSummary?: { kind: string };
+  toolName: string;
+  type: string;
+}): LabelSet {
+  return {
+    error_category:
+      evt.type === "tool.execution.error"
+        ? lowCardinalityLabel(evt.errorCategory, "other")
+        : "none",
+    outcome: evt.type === "tool.execution.error" ? "error" : "completed",
+    params_kind: lowCardinalityLabel(evt.paramsSummary?.kind),
+    tool: lowCardinalityLabel(evt.toolName, "tool"),
+  };
+}
+
+function harnessLabels(evt: {
+  channel?: string;
+  errorCategory?: string;
+  harnessId: string;
+  model?: string;
+  outcome?: string;
+  phase?: string;
+  pluginId?: string;
+  provider?: string;
+  type: string;
+}): LabelSet {
+  return {
+    channel: lowCardinalityLabel(evt.channel),
+    error_category:
+      evt.type === "harness.run.error" ? lowCardinalityLabel(evt.errorCategory, "other") : "none",
+    harness: lowCardinalityLabel(evt.harnessId),
+    model: lowCardinalityLabel(evt.model),
+    outcome: evt.type === "harness.run.error" ? "error" : lowCardinalityLabel(evt.outcome),
+    phase: evt.type === "harness.run.error" ? lowCardinalityLabel(evt.phase) : "none",
+    plugin: lowCardinalityLabel(evt.pluginId),
+    provider: lowCardinalityLabel(evt.provider),
+  };
+}
+
+function recordModelUsage(
+  store: PrometheusMetricStore,
+  evt: Extract<DiagnosticEventPayload, { type: "model.usage" }>,
+) {
+  const labels = {
+    agent: lowCardinalityLabel(evt.agentId),
+    channel: lowCardinalityLabel(evt.channel),
+    model: lowCardinalityLabel(evt.model),
+    provider: lowCardinalityLabel(evt.provider),
+  };
+  const usage = evt.usage;
+  const recordTokens = (tokenType: string, value: number | undefined) => {
+    const amount = numericValue(value);
+    if (amount === undefined || amount === 0) {
+      return;
+    }
+    store.counter(
+      "openclaw_model_tokens_total",
+      "Model tokens reported by diagnostic usage events.",
+      {
+        ...labels,
+        token_type: tokenType,
+      },
+      amount,
+    );
+    if (tokenType === "input" || tokenType === "output") {
+      store.histogram(
+        "openclaw_gen_ai_client_token_usage",
+        "GenAI token usage distribution for input and output tokens.",
+        {
+          model: labels.model,
+          provider: labels.provider,
+          token_type: tokenType,
+        },
+        amount,
+        TOKEN_BUCKETS,
+      );
+    }
+  };
+
+  recordTokens("input", usage.input);
+  recordTokens("output", usage.output);
+  recordTokens("cache_read", usage.cacheRead);
+  recordTokens("cache_write", usage.cacheWrite);
+  recordTokens("prompt", usage.promptTokens);
+  recordTokens("total", usage.total);
+
+  store.counter(
+    "openclaw_model_cost_usd_total",
+    "Estimated model cost in USD reported by diagnostic usage events.",
+    labels,
+    numericValue(evt.costUsd) ?? 0,
+  );
+  store.histogram(
+    "openclaw_model_usage_duration_seconds",
+    "Model usage event duration in seconds.",
+    labels,
+    seconds(evt.durationMs),
+  );
+}
+
+/**
+ * Translates a single diagnostic event into samples on the Prometheus metric store.
+ *
+ * Events whose metadata is not flagged as trusted are discarded, and event types
+ * without a matching case below are ignored.
+ *
+ * @param store - Metric store that receives the counter, gauge, and histogram samples.
+ * @param evt - Diagnostic event to record; `evt.type` selects which metrics are touched.
+ * @param metadata - Event metadata; only `trusted` is consulted here.
+ */
+function recordDiagnosticEvent(
+  store: PrometheusMetricStore,
+  evt: DiagnosticEventPayload,
+  metadata: DiagnosticEventMetadata,
+): void {
+  if (!metadata.trusted) {
+    return;
+  }
+
+  switch (evt.type) {
+    case "model.usage": {
+      recordModelUsage(store, evt);
+      break;
+    }
+
+    case "run.completed": {
+      const labels = runLabels(evt);
+      store.histogram(
+        "openclaw_run_duration_seconds",
+        "Agent run duration in seconds.",
+        labels,
+        seconds(evt.durationMs),
+      );
+      store.counter("openclaw_run_completed_total", "Agent runs completed by outcome.", labels);
+      break;
+    }
+
+    case "model.call.completed":
+    case "model.call.error": {
+      const labels = modelCallLabels(evt);
+      store.histogram(
+        "openclaw_model_call_duration_seconds",
+        "Provider model call duration in seconds.",
+        labels,
+        seconds(evt.durationMs),
+      );
+      store.counter(
+        "openclaw_model_call_total",
+        "Provider model calls completed by outcome.",
+        labels,
+      );
+      break;
+    }
+
+    case "tool.execution.completed":
+    case "tool.execution.error": {
+      const labels = toolExecutionLabels(evt);
+      store.histogram(
+        "openclaw_tool_execution_duration_seconds",
+        "Tool execution duration in seconds.",
+        labels,
+        seconds(evt.durationMs),
+      );
+      store.counter(
+        "openclaw_tool_execution_total",
+        "Tool executions completed by outcome.",
+        labels,
+      );
+      break;
+    }
+
+    case "harness.run.completed":
+    case "harness.run.error": {
+      const labels = harnessLabels(evt);
+      store.histogram(
+        "openclaw_harness_run_duration_seconds",
+        "Agent harness run duration in seconds.",
+        labels,
+        seconds(evt.durationMs),
+      );
+      store.counter(
+        "openclaw_harness_run_total",
+        "Agent harness runs completed by outcome.",
+        labels,
+      );
+      break;
+    }
+
+    case "message.processed": {
+      // The counter and the duration histogram are keyed by the same label set.
+      const processedLabels = {
+        channel: lowCardinalityLabel(evt.channel),
+        outcome: evt.outcome,
+        reason: lowCardinalityLabel(evt.reason, "none"),
+      };
+      store.counter(
+        "openclaw_message_processed_total",
+        "Inbound messages processed by outcome.",
+        processedLabels,
+      );
+      store.histogram(
+        "openclaw_message_processed_duration_seconds",
+        "Inbound message processing duration in seconds.",
+        processedLabels,
+        seconds(evt.durationMs),
+      );
+      break;
+    }
+
+    case "message.delivery.completed":
+    case "message.delivery.error": {
+      // Completed deliveries report error_category "none"; failed ones fall back to "other".
+      const deliveryLabels = {
+        channel: lowCardinalityLabel(evt.channel),
+        delivery_kind: evt.deliveryKind,
+        error_category:
+          evt.type === "message.delivery.error"
+            ? lowCardinalityLabel(evt.errorCategory, "other")
+            : "none",
+        outcome: evt.type === "message.delivery.error" ? "error" : "completed",
+      };
+      store.counter(
+        "openclaw_message_delivery_total",
+        "Outbound message delivery attempts by outcome.",
+        deliveryLabels,
+      );
+      store.histogram(
+        "openclaw_message_delivery_duration_seconds",
+        "Outbound message delivery duration in seconds.",
+        deliveryLabels,
+        seconds(evt.durationMs),
+      );
+      break;
+    }
+
+    case "queue.lane.enqueue":
+    case "queue.lane.dequeue": {
+      const lane = lowCardinalityLabel(evt.lane);
+      store.gauge(
+        "openclaw_queue_lane_size",
+        "Current diagnostic queue lane size.",
+        { lane },
+        numericValue(evt.queueSize),
+      );
+      // Only dequeue events contribute a wait-time observation.
+      if (evt.type === "queue.lane.dequeue") {
+        store.histogram(
+          "openclaw_queue_lane_wait_seconds",
+          "Queue lane wait time in seconds.",
+          { lane },
+          seconds(evt.waitMs),
+        );
+      }
+      break;
+    }
+
+    case "session.state": {
+      store.counter("openclaw_session_state_total", "Session state observations.", {
+        reason: lowCardinalityLabel(evt.reason, "none"),
+        state: evt.state,
+      });
+      // The queue depth gauge is only updated when the event reports a depth.
+      if (evt.queueDepth !== undefined) {
+        store.gauge(
+          "openclaw_session_queue_depth",
+          "Latest observed session queue depth.",
+          { state: evt.state },
+          numericValue(evt.queueDepth),
+        );
+      }
+      break;
+    }
+
+    case "diagnostic.memory.sample": {
+      // One gauge sample per memory kind, emitted in this fixed order.
+      const memoryByKind = [
+        ["rss", evt.memory.rssBytes],
+        ["heap_total", evt.memory.heapTotalBytes],
+        ["heap_used", evt.memory.heapUsedBytes],
+      ] as const;
+      for (const [kind, bytes] of memoryByKind) {
+        store.gauge(
+          "openclaw_memory_bytes",
+          "Latest process memory usage by memory kind.",
+          { kind },
+          bytes,
+        );
+      }
+      store.histogram(
+        "openclaw_memory_rss_bytes",
+        "RSS memory sample distribution in bytes.",
+        {},
+        numericValue(evt.memory.rssBytes),
+        BYTE_BUCKETS,
+      );
+      break;
+    }
+
+    case "diagnostic.memory.pressure": {
+      store.counter(
+        "openclaw_memory_pressure_total",
+        "Memory pressure events by level and reason.",
+        { level: evt.level, reason: evt.reason },
+      );
+      break;
+    }
+
+    case "telemetry.exporter": {
+      store.counter("openclaw_telemetry_exporter_total", "Telemetry exporter lifecycle events.", {
+        exporter: lowCardinalityLabel(evt.exporter),
+        reason: lowCardinalityLabel(evt.reason, "none"),
+        signal: evt.signal,
+        status: evt.status,
+      });
+      break;
+    }
+
+    default:
+      // Event types without a metric mapping are ignored.
+      break;
+  }
+}
+
+/**
+ * Builds the HTTP route handler that serves the store's rendered Prometheus metrics.
+ *
+ * GET and HEAD receive a 200 with `Cache-Control: no-store`; any other method
+ * receives a 405 with an `Allow` header. The handler returns `true` on every path.
+ *
+ * @param store - Metric store rendered on each accepted request.
+ * @returns Route handler bound to `store`.
+ */
+function createMetricsHandler(store: PrometheusMetricStore): OpenClawPluginHttpRouteHandler {
+  return (req: IncomingMessage, res: ServerResponse) => {
+    const method = req.method;
+    const isHead = method === "HEAD";
+
+    if (!isHead && method !== "GET") {
+      res.statusCode = 405;
+      res.setHeader("Allow", "GET, HEAD");
+      res.end("Method Not Allowed");
+      return true;
+    }
+
+    const payload = renderPrometheusMetrics(store);
+    res.statusCode = 200;
+    res.setHeader("Cache-Control", "no-store");
+    res.setHeader("Content-Type", "text/plain; version=0.0.4; charset=utf-8");
+
+    // HEAD gets the same status and headers as GET, but no body.
+    if (isHead) {
+      res.end();
+    } else {
+      res.end(payload);
+    }
+    return true;
+  };
+}
+
+/**
+ * Creates the diagnostics Prometheus exporter: one metric store plus the pieces
+ * that feed it and expose it.
+ *
+ * @returns An object with:
+ *   - `handler`: HTTP route handler that serves the rendered metrics,
+ *   - `render`: function that renders the store's current metrics,
+ *   - `service`: plugin service that subscribes to internal diagnostics on
+ *     `start` and unsubscribes and resets the store on `stop`.
+ */
+export function createDiagnosticsPrometheusExporter() {
+  const store = createPrometheusMetricStore();
+  let unsubscribe: (() => void) | undefined;
+
+  const service = {
+    id: "diagnostics-prometheus",
+    start(ctx) {
+      const subscribe = ctx.internalDiagnostics?.onEvent;
+      if (!subscribe) {
+        ctx.logger.error("diagnostics-prometheus: internal diagnostics capability unavailable");
+        return;
+      }
+      // Release any subscription left over from an earlier start() so a repeated
+      // start cannot leave two listeners recording every event twice.
+      unsubscribe?.();
+      unsubscribe = subscribe((event, metadata) => {
+        try {
+          recordDiagnosticEvent(store, event, metadata);
+        } catch (err) {
+          // A failure while recording is logged rather than rethrown to the emitter.
+          ctx.logger.error(
+            `diagnostics-prometheus: event handler failed (${event.type}): ${safeErrorMessage(err)}`,
+          );
+        }
+      });
+      // Emitted after subscribing, so this exporter's own listener receives the event too.
+      ctx.internalDiagnostics?.emit({
+        type: "telemetry.exporter",
+        exporter: "diagnostics-prometheus",
+        signal: "metrics",
+        status: "started",
+        reason: "configured",
+      });
+    },
+    stop() {
+      unsubscribe?.();
+      unsubscribe = undefined;
+      store.reset();
+    },
+  } satisfies OpenClawPluginService;
+
+  return {
+    handler: createMetricsHandler(store),
+    render: () => renderPrometheusMetrics(store),
+    service,
+  };
+}
+
+// Module-internal helpers re-exported under `__test__`, presumably so unit tests
+// can exercise the store, the event recorder, and the renderer directly.
+export const __test__ = {
+  createPrometheusMetricStore,
+  recordDiagnosticEvent,
+  renderPrometheusMetrics,
+};
--- a/extensions/diagnostics-prometheus/tsconfig.json
+++ b/extensions/diagnostics-prometheus/tsconfig.json
@@ -0,0 +1,16 @@
+{
+  "extends": "../tsconfig.package-boundary.base.json",
+  "compilerOptions": {
+    "rootDir": "."
+  },
+  "include": ["./*.ts", "./src/**/*.ts"],
+  "exclude": [
+    "./**/*.test.ts",
+    "./dist/**",
+    "./node_modules/**",
+    "./src/test-support/**",
+    "./src/**/*test-helpers.ts",
+    "./src/**/*test-harness.ts",
+    "./src/**/*test-support.ts"
+  ]
+}
--- a/package.json
+++ b/package.json
@@ -596,6 +596,10 @@
      "types": "./dist/plugin-sdk/diagnostics-otel.d.ts",
      "default": "./dist/plugin-sdk/diagnostics-otel.js"
    },
+    "./plugin-sdk/diagnostics-prometheus": {
+      "types": "./dist/plugin-sdk/diagnostics-prometheus.d.ts",
+      "default": "./dist/plugin-sdk/diagnostics-prometheus.js"
+    },
    "./plugin-sdk/diffs": {
      "types": "./dist/plugin-sdk/diffs.d.ts",
      "default": "./dist/plugin-sdk/diffs.js"
--- a/scripts/lib/plugin-sdk-entrypoints.json
+++ b/scripts/lib/plugin-sdk-entrypoints.json
@@ -134,6 +134,7 @@
  "device-bootstrap",
  "diagnostic-runtime",
  "diagnostics-otel",
+  "diagnostics-prometheus",
  "diffs",
  "error-runtime",
  "extension-shared",
--- a/src/channels/plugins/contracts/channel-import-guardrails.test.ts
+++ b/src/channels/plugins/contracts/channel-import-guardrails.test.ts
@@ -191,6 +191,7 @@ const LOCAL_EXTENSION_API_BARREL_GUARDS = [
  "bluebubbles",
  "device-pair",
  "diagnostics-otel",
+  "diagnostics-prometheus",
  "discord",
  "diffs",
  "feishu",
--- a/src/plugin-sdk/diagnostics-prometheus.ts
+++ b/src/plugin-sdk/diagnostics-prometheus.ts
@@ -0,0 +1,15 @@
+// Narrow plugin-sdk surface for the bundled diagnostics-prometheus plugin.
+// Keep this list additive and scoped to the bundled diagnostics-prometheus surface.
+
+export type {
+  DiagnosticEventMetadata,
+  DiagnosticEventPayload,
+} from "../infra/diagnostic-events.js";
+export { redactSensitiveText } from "../logging/redact.js";
+export { emptyPluginConfigSchema } from "../plugins/config-schema.js";
+export type {
+  OpenClawPluginApi,
+  OpenClawPluginHttpRouteHandler,
+  OpenClawPluginService,
+  OpenClawPluginServiceContext,
+} from "../plugins/types.js";
--- a/src/plugins/services.test.ts
+++ b/src/plugins/services.test.ts
@@ -180,7 +180,7 @@ describe("startPluginServices", () => {
    expect(stopThrows).toHaveBeenCalledOnce();
  });

-  it("grants internal diagnostics only to the bundled diagnostics OTEL service", async () => {
+  it("grants internal diagnostics only to bundled diagnostics exporter services", async () => {
    const contexts: OpenClawPluginServiceContext[] = [];
    const diagnosticsService = createTrackingService("diagnostics-otel", { contexts });
    await startPluginServices({
@@ -191,6 +191,18 @@ describe("startPluginServices", () => {
    expect(contexts[0]?.internalDiagnostics?.onEvent).toBeTypeOf("function");
    expect(contexts[0]?.internalDiagnostics?.emit).toBeTypeOf("function");

+    const prometheusContexts: OpenClawPluginServiceContext[] = [];
+    const prometheusService = createTrackingService("diagnostics-prometheus", {
+      contexts: prometheusContexts,
+    });
+    await startPluginServices({
+      registry: createRegistry([prometheusService], "diagnostics-prometheus", "bundled"),
+      config: createServiceConfig(),
+    });
+
+    expect(prometheusContexts[0]?.internalDiagnostics?.onEvent).toBeTypeOf("function");
+    expect(prometheusContexts[0]?.internalDiagnostics?.emit).toBeTypeOf("function");
+
    const untrustedContexts: OpenClawPluginServiceContext[] = [];
    const untrustedService = createTrackingService("diagnostics-otel", {
      contexts: untrustedContexts,
--- a/src/plugins/services.ts
+++ b/src/plugins/services.ts
@@ -24,14 +24,18 @@ function createServiceContext(params: {
  workspaceDir?: string;
  service?: PluginServiceRegistration;
 }): OpenClawPluginServiceContext {
+  const grantsInternalDiagnostics =
+    params.service?.origin === "bundled" &&
+    params.service.pluginId === params.service.service.id &&
+    (params.service.service.id === "diagnostics-otel" ||
+      params.service.service.id === "diagnostics-prometheus");
+
  return {
    config: params.config,
    workspaceDir: params.workspaceDir,
    stateDir: STATE_DIR,
    logger: createPluginLogger(),
-    ...(params.service?.origin === "bundled" &&
-    params.service.pluginId === "diagnostics-otel" &&
-    params.service.service.id === "diagnostics-otel"
+    ...(grantsInternalDiagnostics
      ? {
          internalDiagnostics: {
            emit: emitTrustedDiagnosticEvent,
				`@@ -0,0 +1 @@`
				`export * from "openclaw/plugin-sdk/diagnostics-prometheus";`