feat(diagnostics): add harness lifecycle telemetry

2026-05-06 06:50:43 +00:00 · 2026-04-25 23:34:03 -07:00
parent 8bbb143ab8
commit 82ddcf24f5
10 changed files with 516 additions and 6 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -26,6 +26,7 @@ Docs: https://docs.openclaw.ai
 - Diagnostics/OTEL: align model-call GenAI span attributes with OpenTelemetry stability opt-in semantics, keeping legacy `gen_ai.system` by default while emitting `gen_ai.provider.name` under `OTEL_SEMCONV_STABILITY_OPT_IN=gen_ai_latest_experimental`. Thanks @vincentkoc.
 - Diagnostics/OTEL: support signal-specific OTLP endpoint overrides for traces, metrics, and logs via config or standard OTEL environment variables. Thanks @vincentkoc.
 - Diagnostics/OTEL: emit bounded telemetry exporter health diagnostics for startup and log-export failures without exporting raw error text. Thanks @vincentkoc.
+- Diagnostics/OTEL: export agent harness lifecycle telemetry as bounded `openclaw.harness.run` spans and `openclaw.harness.duration_ms` metrics so QA-lab, Codex, and future harnesses share one trace shape. Thanks @vincentkoc.
 - Plugins/CLI: add `openclaw plugins registry` for explicit persisted-registry inspection and `--refresh` repair without making normal startup rescan plugin locations. Thanks @vincentkoc.
 - Plugins/CLI: make `openclaw plugins list` read the cold persisted registry snapshot by default, leaving module-aware diagnostics to `plugins doctor` and `plugins inspect`. Thanks @vincentkoc.
 - Plugins/startup: move gateway startup plugin planning onto the versioned cold registry index, with postinstall repair for older registry files that predate startup metadata. Thanks @vincentkoc.
--- a/docs/concepts/qa-e2e-automation.md
+++ b/docs/concepts/qa-e2e-automation.md
@@ -59,9 +59,9 @@ pnpm qa:otel:smoke
 That script starts a local OTLP/HTTP trace receiver, runs the
 `otel-trace-smoke` QA scenario with the `diagnostics-otel` plugin enabled, then
 decodes the exported protobuf spans and asserts the release-critical shape:
-`openclaw.run`, `openclaw.model.call`, `openclaw.context.assembled`, and
-`openclaw.message.delivery` must be present; model calls must not export
-`StreamAbandoned` on successful turns; raw diagnostic IDs and
+`openclaw.run`, `openclaw.harness.run`, `openclaw.model.call`,
+`openclaw.context.assembled`, and `openclaw.message.delivery` must be present;
+model calls must not export `StreamAbandoned` on successful turns; raw diagnostic IDs and
 `openclaw.content.*` attributes must stay out of the trace. It writes
 `otel-smoke-summary.json` next to the QA suite artifacts.

--- a/extensions/diagnostics-otel/src/service.test.ts
+++ b/extensions/diagnostics-otel/src/service.test.ts
@@ -1140,6 +1140,28 @@ describe("diagnostics-otel service", () => {
        traceFlags: "01",
      },
    });
+    emitDiagnosticEvent({
+      type: "harness.run.completed",
+      runId: "run-1",
+      sessionKey: "session-key",
+      sessionId: "session-1",
+      provider: "codex",
+      model: "gpt-5.4",
+      channel: "qa",
+      harnessId: "codex",
+      pluginId: "codex-plugin",
+      outcome: "completed",
+      durationMs: 90,
+      resultClassification: "reasoning-only",
+      yieldDetected: true,
+      itemLifecycle: { startedCount: 3, completedCount: 2, activeCount: 1 },
+      trace: {
+        traceId: TRACE_ID,
+        spanId: GRANDCHILD_SPAN_ID,
+        parentSpanId: CHILD_SPAN_ID,
+        traceFlags: "01",
+      },
+    });
    emitDiagnosticEvent({
      type: "tool.execution.error",
      runId: "run-1",
@@ -1160,7 +1182,12 @@ describe("diagnostics-otel service", () => {

    const spanNames = telemetryState.tracer.startSpan.mock.calls.map((call) => call[0]);
    expect(spanNames).toEqual(
-      expect.arrayContaining(["openclaw.run", "openclaw.model.call", "openclaw.tool.execution"]),
+      expect.arrayContaining([
+        "openclaw.run",
+        "openclaw.model.call",
+        "openclaw.harness.run",
+        "openclaw.tool.execution",
+      ]),
    );

    const runCall = telemetryState.tracer.startSpan.mock.calls.find(
@@ -1207,6 +1234,36 @@ describe("diagnostics-otel service", () => {
    });
    expect(modelCall?.[2]).toBeUndefined();

+    const harnessCall = telemetryState.tracer.startSpan.mock.calls.find(
+      (call) => call[0] === "openclaw.harness.run",
+    );
+    expect(harnessCall?.[1]).toMatchObject({
+      attributes: {
+        "openclaw.harness.id": "codex",
+        "openclaw.harness.plugin": "codex-plugin",
+        "openclaw.outcome": "completed",
+        "openclaw.provider": "codex",
+        "openclaw.model": "gpt-5.4",
+        "openclaw.channel": "qa",
+        "openclaw.harness.result_classification": "reasoning-only",
+        "openclaw.harness.yield_detected": true,
+        "openclaw.harness.items.started": 3,
+        "openclaw.harness.items.completed": 2,
+        "openclaw.harness.items.active": 1,
+      },
+      startTime: expect.any(Number),
+    });
+    expect(harnessCall?.[1]).toEqual({
+      attributes: expect.not.objectContaining({
+        "openclaw.runId": expect.anything(),
+        "openclaw.sessionId": expect.anything(),
+        "openclaw.sessionKey": expect.anything(),
+        "openclaw.traceId": expect.anything(),
+      }),
+      startTime: expect.any(Number),
+    });
+    expect(harnessCall?.[2]).toBeUndefined();
+
    const toolCall = telemetryState.tracer.startSpan.mock.calls.find(
      (call) => call[0] === "openclaw.tool.execution",
    );
@@ -1244,6 +1301,25 @@ describe("diagnostics-otel service", () => {
        "openclaw.runId": expect.anything(),
      }),
    );
+    expect(
+      telemetryState.histograms.get("openclaw.harness.duration_ms")?.record,
+    ).toHaveBeenCalledWith(
+      90,
+      expect.objectContaining({
+        "openclaw.harness.id": "codex",
+        "openclaw.harness.plugin": "codex-plugin",
+        "openclaw.outcome": "completed",
+      }),
+    );
+    expect(
+      telemetryState.histograms.get("openclaw.harness.duration_ms")?.record,
+    ).toHaveBeenCalledWith(
+      90,
+      expect.not.objectContaining({
+        "openclaw.runId": expect.anything(),
+        "openclaw.sessionKey": expect.anything(),
+      }),
+    );
    expect(
      telemetryState.histograms.get("openclaw.tool.execution.duration_ms")?.record,
    ).toHaveBeenCalledWith(
--- a/extensions/diagnostics-otel/src/service.ts
+++ b/extensions/diagnostics-otel/src/service.ts
@@ -81,6 +81,10 @@ type ModelCallLifecycleDiagnosticEvent = Extract<
  DiagnosticEventPayload,
  { type: "model.call.completed" | "model.call.error" }
 >;
+type HarnessRunLifecycleDiagnosticEvent = Extract< // Terminal harness events only; "harness.run.started" is not part of this union.
+  DiagnosticEventPayload,
+  { type: "harness.run.completed" | "harness.run.error" }
+>;
 type TelemetryExporterDiagnosticEvent = Extract<
  DiagnosticEventPayload,
  { type: "telemetry.exporter" }
@@ -720,6 +724,10 @@ export function createDiagnosticsOtelService(): OpenClawPluginService {
        unit: "ms",
        description: "Agent run duration",
      });
+      const harnessDurationHistogram = meter.createHistogram("openclaw.harness.duration_ms", {
+        unit: "ms",
+        description: "Agent harness lifecycle duration",
+      });
      const contextHistogram = meter.createHistogram("openclaw.context.tokens", {
        unit: "1",
        description: "Context window size and usage",
@@ -1426,6 +1434,82 @@ export function createDiagnosticsOtelService(): OpenClawPluginService {
        span.end(evt.ts);
      };

+      const harnessRunMetricAttrs = (evt: HarnessRunLifecycleDiagnosticEvent) => ({ // Attribute set shared by the harness duration metric and the harness span; it never reads runId, sessionId, or sessionKey.
+        "openclaw.harness.id": lowCardinalityAttr(evt.harnessId, "unknown"),
+        "openclaw.harness.plugin": lowCardinalityAttr(evt.pluginId), // NOTE(review): pluginId is optional and gets no second argument here, unlike the "unknown" arguments nearby — confirm lowCardinalityAttr accepts undefined.
+        "openclaw.outcome": evt.type === "harness.run.error" ? "error" : evt.outcome, // Error events carry no `outcome` field, so they are always reported as "error".
+        "openclaw.provider": lowCardinalityAttr(evt.provider, "unknown"),
+        "openclaw.model": lowCardinalityAttr(evt.model, "unknown"),
+        ...(evt.channel ? { "openclaw.channel": lowCardinalityAttr(evt.channel) } : {}), // The channel key is added only when the event has a channel.
+      });
+
+      const recordHarnessRunCompleted = ( // Records the harness duration metric and, when traces are enabled, an `openclaw.harness.run` span for a completed harness run.
+        evt: Extract<DiagnosticEventPayload, { type: "harness.run.completed" }>,
+        metadata: DiagnosticEventMetadata,
+      ) => {
+        harnessDurationHistogram.record(evt.durationMs, harnessRunMetricAttrs(evt)); // Recorded before the traces check below, so the metric does not depend on tracesEnabled.
+        if (!tracesEnabled) {
+          return;
+        }
+        const spanAttrs: Record<string, string | number | boolean> = {
+          ...harnessRunMetricAttrs(evt),
+        };
+        if (evt.resultClassification) { // Everything added from here on is attached to the span only, not to the metric.
+          spanAttrs["openclaw.harness.result_classification"] = lowCardinalityAttr(
+            evt.resultClassification,
+          );
+        }
+        if (typeof evt.yieldDetected === "boolean") { // A `false` value is still exported; only a missing value is skipped.
+          spanAttrs["openclaw.harness.yield_detected"] = evt.yieldDetected;
+        }
+        if (evt.itemLifecycle) {
+          spanAttrs["openclaw.harness.items.started"] = evt.itemLifecycle.startedCount;
+          spanAttrs["openclaw.harness.items.completed"] = evt.itemLifecycle.completedCount;
+          spanAttrs["openclaw.harness.items.active"] = evt.itemLifecycle.activeCount;
+        }
+        const span = spanWithDuration("openclaw.harness.run", spanAttrs, evt.durationMs, {
+          parentContext: contextForTrustedDiagnosticSpanParent(evt, metadata),
+          endTimeMs: evt.ts,
+        });
+        if (evt.outcome === "error") { // Only the "error" outcome sets an error status; "aborted" and "timed_out" leave the span status untouched.
+          span.setStatus({
+            code: SpanStatusCode.ERROR,
+            message: "error",
+          });
+        }
+        span.end(evt.ts);
+      };
+
+      const recordHarnessRunError = ( // Records the harness duration metric and, when traces are enabled, an errored `openclaw.harness.run` span for a failed harness run.
+        evt: Extract<DiagnosticEventPayload, { type: "harness.run.error" }>,
+        metadata: DiagnosticEventMetadata,
+      ) => {
+        const errorType = lowCardinalityAttr(evt.errorCategory, "other");
+        const attrs = { // Unlike the completed path, the metric attributes here also include the failing phase and the error category.
+          ...harnessRunMetricAttrs(evt),
+          "openclaw.harness.phase": evt.phase,
+          "openclaw.errorCategory": errorType,
+        };
+        harnessDurationHistogram.record(evt.durationMs, attrs); // Recorded before the traces check below, so the metric does not depend on tracesEnabled.
+        if (!tracesEnabled) {
+          return;
+        }
+        const spanAttrs: Record<string, string | number | boolean> = {
+          ...attrs,
+          "error.type": errorType, // `error.type` and the cleanup flag below are span-only attributes.
+          ...(evt.cleanupFailed ? { "openclaw.harness.cleanup_failed": true } : {}),
+        };
+        const span = spanWithDuration("openclaw.harness.run", spanAttrs, evt.durationMs, {
+          parentContext: contextForTrustedDiagnosticSpanParent(evt, metadata),
+          endTimeMs: evt.ts,
+        });
+        span.setStatus({ // The span is always marked as an error; the status message is the bounded category, not raw error text.
+          code: SpanStatusCode.ERROR,
+          message: errorType,
+        });
+        span.end(evt.ts);
+      };
+
      const recordContextAssembled = (
        evt: Extract<DiagnosticEventPayload, { type: "context.assembled" }>,
        metadata: DiagnosticEventMetadata,
@@ -1746,6 +1830,12 @@ export function createDiagnosticsOtelService(): OpenClawPluginService {
            case "run.completed":
              recordRunCompleted(evt, metadata);
              return;
+            case "harness.run.completed":
+              recordHarnessRunCompleted(evt, metadata);
+              return;
+            case "harness.run.error":
+              recordHarnessRunError(evt, metadata);
+              return;
            case "context.assembled":
              recordContextAssembled(evt, metadata);
              return;
@@ -1781,6 +1871,7 @@ export function createDiagnosticsOtelService(): OpenClawPluginService {
              return;
            case "tool.execution.started":
            case "run.started":
+            case "harness.run.started":
            case "model.call.started":
            case "payload.large":
              return;
--- a/qa/scenarios/runtime/otel-trace-smoke.md
+++ b/qa/scenarios/runtime/otel-trace-smoke.md
@@ -13,6 +13,7 @@ objective: Verify a QA-lab gateway run emits bounded OpenTelemetry trace spans t
 successCriteria:
  - The diagnostics-otel plugin starts with trace export enabled.
  - A minimal QA-channel agent turn completes.
+  - The trace includes the selected agent harness lifecycle span.
  - The run emits low-cardinality OpenTelemetry trace spans without content or raw diagnostic identifiers.
 plugins:
  - diagnostics-otel
@@ -33,6 +34,7 @@ docsRefs:
  - docs/concepts/qa-e2e-automation.md
 codeRefs:
  - extensions/diagnostics-otel/src/service.ts
+  - src/agents/harness/v2.ts
  - extensions/qa-lab/src/suite.ts
 execution:
  kind: flow
--- a/scripts/qa-otel-smoke.ts
+++ b/scripts/qa-otel-smoke.ts
@@ -80,6 +80,7 @@ type CapturedSpan = {
 const DEFAULT_SCENARIO_ID = "otel-trace-smoke";
 const REQUIRED_SPAN_NAMES = [
  "openclaw.run",
+  "openclaw.harness.run",
  "openclaw.model.call",
  "openclaw.context.assembled",
  "openclaw.message.delivery",
--- a/src/agents/harness/v2.test.ts
+++ b/src/agents/harness/v2.test.ts
@@ -1,5 +1,11 @@
 import type { Api, Model } from "@mariozechner/pi-ai";
-import { describe, expect, it, vi } from "vitest";
+import { afterEach, describe, expect, it, vi } from "vitest";
+import {
+  onInternalDiagnosticEvent,
+  resetDiagnosticEventsForTest,
+  type DiagnosticEventMetadata,
+  type DiagnosticEventPayload,
+} from "../../infra/diagnostic-events.js";
 import type { EmbeddedRunAttemptResult } from "../pi-embedded-runner/run/types.js";
 import type { AgentHarness, AgentHarnessAttemptParams } from "./types.js";
 import type { AgentHarnessV2 } from "./v2.js";
@@ -9,6 +15,7 @@ function createAttemptParams(): AgentHarnessAttemptParams {
  return {
    prompt: "hello",
    sessionId: "session-1",
+    sessionKey: "session-key",
    runId: "run-1",
    sessionFile: "/tmp/session.jsonl",
    workspaceDir: "/tmp/workspace",
@@ -19,9 +26,19 @@ function createAttemptParams(): AgentHarnessAttemptParams {
    authStorage: {} as never,
    modelRegistry: {} as never,
    thinkLevel: "low",
+    messageChannel: "qa",
+    trigger: "manual",
  } as AgentHarnessAttemptParams;
 }

+function createDiagnosticTrace() { // Fixed trace context used as the `diagnosticTrace` of the attempt result fixture.
+  return {
+    traceId: "11111111111111111111111111111111",
+    spanId: "2222222222222222",
+    traceFlags: "01",
+  };
+}
+
 function createAttemptResult(): EmbeddedRunAttemptResult {
  return {
    aborted: false,
@@ -32,6 +49,7 @@ function createAttemptResult(): EmbeddedRunAttemptResult {
    promptError: null,
    promptErrorSource: null,
    sessionIdUsed: "session-1",
+    diagnosticTrace: createDiagnosticTrace(),
    messagesSnapshot: [],
    assistantTexts: ["ok"],
    toolMetas: [],
@@ -46,7 +64,28 @@ function createAttemptResult(): EmbeddedRunAttemptResult {
  };
 }

+async function flushDiagnosticEvents(): Promise<void> { // Resolves after one setImmediate tick; presumably long enough for asynchronously dispatched diagnostic events to reach listeners — confirm against the dispatcher.
+  await new Promise<void>((resolve) => setImmediate(resolve));
+}
+
+function captureDiagnosticEvents(): { // Subscribes to internal diagnostic events and collects the harness ones together with their metadata.
+  events: Array<{ event: DiagnosticEventPayload; metadata: DiagnosticEventMetadata }>;
+  unsubscribe: () => void;
+} {
+  const events: Array<{ event: DiagnosticEventPayload; metadata: DiagnosticEventMetadata }> = [];
+  const unsubscribe = onInternalDiagnosticEvent((event, metadata) => {
+    if (event.type.startsWith("harness.run.")) { // Every other event type is ignored.
+      events.push({ event, metadata });
+    }
+  });
+  return { events, unsubscribe }; // The caller is responsible for invoking `unsubscribe`.
+}
+
 describe("AgentHarness V2 compatibility adapter", () => {
+  afterEach(() => {
+    resetDiagnosticEventsForTest();
+  });
+
  it("executes prepare/start/send/outcome/cleanup as one bounded lifecycle", async () => {
    const params = createAttemptParams();
    const result = createAttemptResult();
@@ -102,6 +141,112 @@ describe("AgentHarness V2 compatibility adapter", () => {
    ]);
  });

+  it("emits trusted harness lifecycle diagnostics for successful attempts", async () => {
+    resetDiagnosticEventsForTest();
+    const params = createAttemptParams();
+    const result = { // Result fixture extended with the classification, yield, and item-lifecycle fields the completed event is expected to carry.
+      ...createAttemptResult(),
+      agentHarnessResultClassification: "reasoning-only",
+      yieldDetected: true,
+      itemLifecycle: { startedCount: 3, completedCount: 2, activeCount: 1 },
+    } as EmbeddedRunAttemptResult;
+    const harness: AgentHarnessV2 = { // Harness stub in which every lifecycle stage succeeds.
+      id: "codex",
+      label: "Codex",
+      pluginId: "codex-plugin",
+      supports: () => ({ supported: true }),
+      prepare: async () => ({
+        harnessId: "codex",
+        label: "Codex",
+        pluginId: "codex-plugin",
+        params,
+        lifecycleState: "prepared",
+      }),
+      start: async (prepared) => ({ ...prepared, lifecycleState: "started" }),
+      send: async () => result,
+      resolveOutcome: async (_session, rawResult) => rawResult,
+      cleanup: async () => {},
+    };
+    const diagnostics = captureDiagnosticEvents();
+    try {
+      await runAgentHarnessV2LifecycleAttempt(harness, params);
+      await flushDiagnosticEvents(); // Wait one tick before asserting on the captured events.
+    } finally {
+      diagnostics.unsubscribe();
+    }

+    expect(diagnostics.events.map(({ event }) => event.type)).toEqual([ // Exactly one started event followed by one completed event.
+      "harness.run.started",
+      "harness.run.completed",
+    ]);
+    expect(diagnostics.events.every(({ metadata }) => metadata.trusted)).toBe(true);
+    expect(diagnostics.events[1]?.event).toMatchObject({
+      type: "harness.run.completed",
+      runId: "run-1",
+      sessionKey: "session-key",
+      sessionId: "session-1",
+      provider: "codex",
+      model: "gpt-5.4",
+      channel: "qa",
+      trigger: "manual",
+      harnessId: "codex",
+      pluginId: "codex-plugin",
+      outcome: "completed",
+      resultClassification: "reasoning-only",
+      yieldDetected: true,
+      itemLifecycle: { startedCount: 3, completedCount: 2, activeCount: 1 },
+      durationMs: expect.any(Number),
+    });
+  });
+
+  it("emits trusted harness error diagnostics with the failing lifecycle phase", async () => {
+    resetDiagnosticEventsForTest();
+    const params = createAttemptParams();
+    const sendError = new Error("codex app-server send failed");
+    const harness: AgentHarnessV2 = { // Harness stub whose send stage throws and whose cleanup throws as well; it declares no pluginId.
+      id: "codex",
+      label: "Codex",
+      supports: () => ({ supported: true }),
+      prepare: async () => ({
+        harnessId: "codex",
+        label: "Codex",
+        params,
+        lifecycleState: "prepared",
+      }),
+      start: async (prepared) => ({ ...prepared, lifecycleState: "started" }),
+      send: async () => {
+        throw sendError;
+      },
+      resolveOutcome: async (_session, rawResult) => rawResult,
+      cleanup: async () => { // The cleanup failure is expected to surface as `cleanupFailed: true` on the error event.
+        throw new Error("cleanup failed");
+      },
+    };
+    const diagnostics = captureDiagnosticEvents();
+    try {
+      await expect(runAgentHarnessV2LifecycleAttempt(harness, params)).rejects.toThrow( // The send error, not the cleanup error, must be the one that propagates.
+        "codex app-server send failed",
+      );
+      await flushDiagnosticEvents(); // Wait one tick before asserting on the captured events.
+    } finally {
+      diagnostics.unsubscribe();
+    }

+    expect(diagnostics.events.map(({ event }) => event.type)).toEqual([ // A failed attempt emits started then error, and no completed event.
+      "harness.run.started",
+      "harness.run.error",
+    ]);
+    expect(diagnostics.events.every(({ metadata }) => metadata.trusted)).toBe(true);
+    expect(diagnostics.events[1]?.event).toMatchObject({
+      type: "harness.run.error",
+      phase: "send",
+      errorCategory: "Error",
+      cleanupFailed: true,
+      harnessId: "codex",
+      durationMs: expect.any(Number),
+    });
+  });
+
  it("runs cleanup with the original failure and preserves that failure", async () => {
    const params = createAttemptParams();
    const sendError = new Error("codex app-server send failed");
--- a/src/agents/harness/v2.ts
+++ b/src/agents/harness/v2.ts
@@ -1,3 +1,10 @@
+import { diagnosticErrorCategory } from "../../infra/diagnostic-error-metadata.js";
+import {
+  emitTrustedDiagnosticEvent,
+  type DiagnosticHarnessRunErrorEvent,
+  type DiagnosticHarnessRunOutcome,
+} from "../../infra/diagnostic-events.js";
+import type { DiagnosticTraceContext } from "../../infra/diagnostic-trace-context.js";
 import { formatErrorMessage } from "../../infra/errors.js";
 import { createSubsystemLogger } from "../../logging/subsystem.js";
 import { applyAgentHarnessResultClassification } from "./result-classification.js";
@@ -13,6 +20,7 @@ import type {
 } from "./types.js";

 const log = createSubsystemLogger("agents/harness/v2");
+type AgentHarnessV2LifecyclePhase = DiagnosticHarnessRunErrorEvent["phase"];

 type AgentHarnessV2RunBase = {
  harnessId: string;
@@ -95,6 +103,87 @@ export function adaptAgentHarnessToV2(harness: AgentHarness): AgentHarnessV2 {
  };
 }

+function agentHarnessDiagnosticBase( // Builds the fields shared by the harness.run.started, .completed, and .error diagnostic events.
+  harness: AgentHarnessV2,
+  params: AgentHarnessAttemptParams,
+  trace?: DiagnosticTraceContext,
+) {
+  return {
+    runId: params.runId,
+    sessionId: params.sessionId,
+    provider: params.provider,
+    model: params.modelId,
+    harnessId: harness.id,
+    ...(harness.pluginId ? { pluginId: harness.pluginId } : {}), // Optional fields are left out entirely when falsy instead of being emitted as undefined.
+    ...(params.sessionKey ? { sessionKey: params.sessionKey } : {}),
+    ...(params.trigger ? { trigger: params.trigger } : {}),
+    ...(params.messageChannel ? { channel: params.messageChannel } : {}), // The attempt's `messageChannel` is exposed on the event as `channel`.
+    ...(trace ? { trace } : {}),
+  };
+}
+
+function agentHarnessRunOutcome(result: AgentHarnessAttemptResult): DiagnosticHarnessRunOutcome { // Maps an attempt result to a single outcome; the first matching check wins.
+  if (result.promptError) {
+    return "error";
+  }
+  if (result.externalAbort || result.aborted) { // NOTE(review): evaluated before the timeout flags, so a result with both `aborted` and a timeout flag set reports "aborted" — confirm this precedence is intended.
+    return "aborted";
+  }
+  if (result.timedOut || result.idleTimedOut || result.timedOutDuringCompaction) {
+    return "timed_out";
+  }
+  return "completed";
+}
+
+function emitAgentHarnessRunStarted( // Emits the harness.run.started diagnostic event through the trusted emitter.
+  harness: AgentHarnessV2,
+  params: AgentHarnessAttemptParams,
+): void {
+  emitTrustedDiagnosticEvent({
+    type: "harness.run.started",
+    ...agentHarnessDiagnosticBase(harness, params), // No trace context is passed for the started event.
+  });
+}
+
+function emitAgentHarnessRunCompleted(params: { // Emits the harness.run.completed diagnostic event, with duration and outcome, through the trusted emitter.
+  harness: AgentHarnessV2;
+  attemptParams: AgentHarnessAttemptParams;
+  result: AgentHarnessAttemptResult;
+  startedAt: number; // Date.now()-based start timestamp; durationMs is measured from it at emit time.
+}): void {
+  const { harness, attemptParams, result, startedAt } = params;
+  emitTrustedDiagnosticEvent({
+    type: "harness.run.completed",
+    ...agentHarnessDiagnosticBase(harness, attemptParams, result.diagnosticTrace), // The trace context comes from the attempt result.
+    durationMs: Date.now() - startedAt,
+    outcome: agentHarnessRunOutcome(result),
+    ...(result.agentHarnessResultClassification
+      ? { resultClassification: result.agentHarnessResultClassification }
+      : {}),
+    ...(typeof result.yieldDetected === "boolean" ? { yieldDetected: result.yieldDetected } : {}),
+    ...(result.itemLifecycle ? { itemLifecycle: { ...result.itemLifecycle } } : {}), // Copied when present, omitted when absent so consumers never receive an empty `{}` lifecycle object.
+  });
+}
+
+function emitAgentHarnessRunError(params: { // Emits the harness.run.error diagnostic event, with duration, phase, and error category, through the trusted emitter.
+  harness: AgentHarnessV2;
+  attemptParams: AgentHarnessAttemptParams;
+  startedAt: number; // Date.now()-based start timestamp; durationMs is measured from it at emit time.
+  phase: AgentHarnessV2LifecyclePhase;
+  error: unknown;
+  cleanupFailed?: boolean;
+}): void {
+  const { harness, attemptParams, startedAt, phase, error, cleanupFailed } = params;
+  emitTrustedDiagnosticEvent({
+    type: "harness.run.error",
+    ...agentHarnessDiagnosticBase(harness, attemptParams), // No trace context is passed for error events.
+    durationMs: Date.now() - startedAt,
+    phase,
+    errorCategory: diagnosticErrorCategory(error), // Only the derived category is put on the event; the error object itself is not.
+    ...(cleanupFailed ? { cleanupFailed: true } : {}), // The flag is emitted only when true.
+  });
+}
+
 export async function runAgentHarnessV2LifecycleAttempt(
  harness: AgentHarnessV2,
  params: AgentHarnessAttemptParams,
@@ -103,13 +192,21 @@ export async function runAgentHarnessV2LifecycleAttempt(
  let session: AgentHarnessV2Session | undefined;
  let rawResult: AgentHarnessAttemptResult | undefined;
  let result: AgentHarnessAttemptResult;
+  let phase: AgentHarnessV2LifecyclePhase = "prepare";
+  const startedAt = Date.now();

+  emitAgentHarnessRunStarted(harness, params);
  try {
+    phase = "prepare";
    prepared = await harness.prepare(params);
+    phase = "start";
    session = await harness.start(prepared);
+    phase = "send";
    rawResult = await harness.send(session);
+    phase = "resolve";
    result = await harness.resolveOutcome(session, rawResult);
  } catch (error) {
+    let cleanupFailed = false;
    try {
      await harness.cleanup({
        prepared,
@@ -118,6 +215,7 @@ export async function runAgentHarnessV2LifecycleAttempt(
        ...(rawResult === undefined ? {} : { result: rawResult }),
      });
    } catch (cleanupError) {
+      cleanupFailed = true;
      // Preserve the user-visible harness failure. Cleanup errors after a
      // failed lifecycle stage must not mask the actionable runtime error.
      log.warn("agent harness cleanup failed after attempt failure", {
@@ -128,9 +226,30 @@ export async function runAgentHarnessV2LifecycleAttempt(
        originalError: formatErrorMessage(error),
      });
    }
+    emitAgentHarnessRunError({
+      harness,
+      attemptParams: params,
+      startedAt,
+      phase,
+      error,
+      cleanupFailed,
+    });
    throw error;
  }

-  await harness.cleanup({ prepared, session, result });
+  try {
+    phase = "cleanup";
+    await harness.cleanup({ prepared, session, result });
+  } catch (error) {
+    emitAgentHarnessRunError({
+      harness,
+      attemptParams: params,
+      startedAt,
+      phase,
+      error,
+    });
+    throw error;
+  }
+  emitAgentHarnessRunCompleted({ harness, attemptParams: params, result, startedAt });
  return result;
 }
--- a/src/infra/diagnostic-events.ts
+++ b/src/infra/diagnostic-events.ts
@@ -256,6 +256,47 @@ export type DiagnosticRunCompletedEvent = DiagnosticRunBaseEvent & {
  errorCategory?: string;
 };

+export type DiagnosticHarnessRunPhase = "prepare" | "start" | "send" | "resolve" | "cleanup"; // Lifecycle stage reported as `phase` on harness.run.error events.
+export type DiagnosticHarnessRunOutcome = "completed" | "aborted" | "timed_out" | "error"; // Outcome reported on harness.run.completed events.
+
+type DiagnosticHarnessRunBaseEvent = DiagnosticBaseEvent & { // Fields common to all three harness.run.* events; only runId and harnessId are required.
+  type: "harness.run.started" | "harness.run.completed" | "harness.run.error";
+  runId: string;
+  sessionKey?: string;
+  sessionId?: string;
+  provider?: string;
+  model?: string;
+  trigger?: string;
+  channel?: string;
+  harnessId: string;
+  pluginId?: string;
+};
+
+export type DiagnosticHarnessRunStartedEvent = DiagnosticHarnessRunBaseEvent & { // Adds no fields beyond the base event; only narrows `type`.
+  type: "harness.run.started";
+};
+
+export type DiagnosticHarnessRunCompletedEvent = DiagnosticHarnessRunBaseEvent & { // Event for a harness run that produced a result.
+  type: "harness.run.completed";
+  durationMs: number;
+  outcome: DiagnosticHarnessRunOutcome; // May itself be "error" even though this is the completed event.
+  resultClassification?: "empty" | "reasoning-only" | "planning-only";
+  yieldDetected?: boolean;
+  itemLifecycle?: { // When present, all three counters are required.
+    startedCount: number;
+    completedCount: number;
+    activeCount: number;
+  };
+};
+
+export type DiagnosticHarnessRunErrorEvent = DiagnosticHarnessRunBaseEvent & { // Event for a harness run that failed; it has no `outcome` field.
+  type: "harness.run.error";
+  durationMs: number;
+  phase: DiagnosticHarnessRunPhase; // Lifecycle phase associated with the failure.
+  errorCategory: string;
+  cleanupFailed?: boolean;
+};
+
 type DiagnosticModelCallBaseEvent = DiagnosticBaseEvent & {
  type: "model.call.started" | "model.call.completed" | "model.call.error";
  runId: string;
@@ -392,6 +433,9 @@ export type DiagnosticEventPayload =
  | DiagnosticExecProcessCompletedEvent
  | DiagnosticRunStartedEvent
  | DiagnosticRunCompletedEvent
+  | DiagnosticHarnessRunStartedEvent
+  | DiagnosticHarnessRunCompletedEvent
+  | DiagnosticHarnessRunErrorEvent
  | DiagnosticModelCallStartedEvent
  | DiagnosticModelCallCompletedEvent
  | DiagnosticModelCallErrorEvent
@@ -446,6 +490,9 @@ const ASYNC_DIAGNOSTIC_EVENT_TYPES = new Set<DiagnosticEventPayload["type"]>([
  "model.call.started",
  "model.call.completed",
  "model.call.error",
+  "harness.run.started",
+  "harness.run.completed",
+  "harness.run.error",
  "context.assembled",
  "log.record",
 ]);
--- a/src/logging/diagnostic-stability.ts
+++ b/src/logging/diagnostic-stability.ts
@@ -305,6 +305,34 @@ function sanitizeDiagnosticEvent(event: DiagnosticEventPayload): DiagnosticStabi
      record.outcome = event.outcome;
      assignReasonCode(record, event.errorCategory);
      break;
+    case "harness.run.started":
+      record.source = event.harnessId;
+      record.pluginId = event.pluginId;
+      record.provider = event.provider;
+      record.model = event.model;
+      record.channel = event.channel;
+      break;
+    case "harness.run.completed":
+      record.source = event.harnessId;
+      record.pluginId = event.pluginId;
+      record.provider = event.provider;
+      record.model = event.model;
+      record.channel = event.channel;
+      record.durationMs = event.durationMs;
+      record.outcome = event.outcome;
+      record.count = event.itemLifecycle?.completedCount;
+      break;
+    case "harness.run.error":
+      record.source = event.harnessId;
+      record.pluginId = event.pluginId;
+      record.provider = event.provider;
+      record.model = event.model;
+      record.channel = event.channel;
+      record.durationMs = event.durationMs;
+      record.outcome = "error";
+      record.action = event.phase;
+      assignReasonCode(record, event.errorCategory);
+      break;
    case "model.call.started":
      record.provider = event.provider;
      record.model = event.model;