feat(diagnostics): add harness lifecycle telemetry

2026-05-06 07:30:43 +00:00 · 2026-04-25 23:34:03 -07:00
parent 8bbb143ab8
commit 82ddcf24f5
10 changed files with 516 additions and 6 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -26,6 +26,7 @@ Docs: https://docs.openclaw.ai
 - Diagnostics/OTEL: align model-call GenAI span attributes with OpenTelemetry stability opt-in semantics, keeping legacy `gen_ai.system` by default while emitting `gen_ai.provider.name` under `OTEL_SEMCONV_STABILITY_OPT_IN=gen_ai_latest_experimental`. Thanks @vincentkoc.
 - Diagnostics/OTEL: support signal-specific OTLP endpoint overrides for traces, metrics, and logs via config or standard OTEL environment variables. Thanks @vincentkoc.
 - Diagnostics/OTEL: emit bounded telemetry exporter health diagnostics for startup and log-export failures without exporting raw error text. Thanks @vincentkoc.
 - Diagnostics/OTEL: export agent harness lifecycle telemetry as bounded `openclaw.harness.run` spans and `openclaw.harness.duration_ms` metrics so QA-lab, Codex, and future harnesses share one trace shape. Thanks @vincentkoc.
 - Plugins/CLI: add `openclaw plugins registry` for explicit persisted-registry inspection and `--refresh` repair without making normal startup rescan plugin locations. Thanks @vincentkoc.
 - Plugins/CLI: make `openclaw plugins list` read the cold persisted registry snapshot by default, leaving module-aware diagnostics to `plugins doctor` and `plugins inspect`. Thanks @vincentkoc.
 - Plugins/startup: move gateway startup plugin planning onto the versioned cold registry index, with postinstall repair for older registry files that predate startup metadata. Thanks @vincentkoc.
--- a/docs/concepts/qa-e2e-automation.md
+++ b/docs/concepts/qa-e2e-automation.md
@@ -59,9 +59,9 @@ pnpm qa:otel:smoke
 That script starts a local OTLP/HTTP trace receiver, runs the
 `otel-trace-smoke` QA scenario with the `diagnostics-otel` plugin enabled, then
 decodes the exported protobuf spans and asserts the release-critical shape:
-`openclaw.run`, `openclaw.model.call`, `openclaw.context.assembled`, and
+`openclaw.run`, `openclaw.harness.run`, `openclaw.model.call`,
-`openclaw.message.delivery` must be present; model calls must not export
+`openclaw.context.assembled`, and `openclaw.message.delivery` must be present;
-`StreamAbandoned` on successful turns; raw diagnostic IDs and
+model calls must not export `StreamAbandoned` on successful turns; raw diagnostic IDs and
 `openclaw.content.*` attributes must stay out of the trace. It writes
 `otel-smoke-summary.json` next to the QA suite artifacts.
--- a/extensions/diagnostics-otel/src/service.test.ts
+++ b/extensions/diagnostics-otel/src/service.test.ts
@@ -1140,6 +1140,28 @@ describe("diagnostics-otel service", () => {
        traceFlags: "01",
      },
    });
    emitDiagnosticEvent({
      type: "harness.run.completed",
      runId: "run-1",
      sessionKey: "session-key",
      sessionId: "session-1",
      provider: "codex",
      model: "gpt-5.4",
      channel: "qa",
      harnessId: "codex",
      pluginId: "codex-plugin",
      outcome: "completed",
      durationMs: 90,
      resultClassification: "reasoning-only",
      yieldDetected: true,
      itemLifecycle: { startedCount: 3, completedCount: 2, activeCount: 1 },
      trace: {
        traceId: TRACE_ID,
        spanId: GRANDCHILD_SPAN_ID,
        parentSpanId: CHILD_SPAN_ID,
        traceFlags: "01",
      },
    });
    emitDiagnosticEvent({
      type: "tool.execution.error",
      runId: "run-1",
@@ -1160,7 +1182,12 @@ describe("diagnostics-otel service", () => {
    const spanNames = telemetryState.tracer.startSpan.mock.calls.map((call) => call[0]);
    expect(spanNames).toEqual(
-      expect.arrayContaining(["openclaw.run", "openclaw.model.call", "openclaw.tool.execution"]),
+      expect.arrayContaining([
        "openclaw.run",
        "openclaw.model.call",
        "openclaw.harness.run",
        "openclaw.tool.execution",
      ]),
    );
    const runCall = telemetryState.tracer.startSpan.mock.calls.find(
@@ -1207,6 +1234,36 @@ describe("diagnostics-otel service", () => {
    });
    expect(modelCall?.[2]).toBeUndefined();
    const harnessCall = telemetryState.tracer.startSpan.mock.calls.find(
      (call) => call[0] === "openclaw.harness.run",
    );
    expect(harnessCall?.[1]).toMatchObject({
      attributes: {
        "openclaw.harness.id": "codex",
        "openclaw.harness.plugin": "codex-plugin",
        "openclaw.outcome": "completed",
        "openclaw.provider": "codex",
        "openclaw.model": "gpt-5.4",
        "openclaw.channel": "qa",
        "openclaw.harness.result_classification": "reasoning-only",
        "openclaw.harness.yield_detected": true,
        "openclaw.harness.items.started": 3,
        "openclaw.harness.items.completed": 2,
        "openclaw.harness.items.active": 1,
      },
      startTime: expect.any(Number),
    });
    expect(harnessCall?.[1]).toEqual({
      attributes: expect.not.objectContaining({
        "openclaw.runId": expect.anything(),
        "openclaw.sessionId": expect.anything(),
        "openclaw.sessionKey": expect.anything(),
        "openclaw.traceId": expect.anything(),
      }),
      startTime: expect.any(Number),
    });
    expect(harnessCall?.[2]).toBeUndefined();
    const toolCall = telemetryState.tracer.startSpan.mock.calls.find(
      (call) => call[0] === "openclaw.tool.execution",
    );
@@ -1244,6 +1301,25 @@ describe("diagnostics-otel service", () => {
        "openclaw.runId": expect.anything(),
      }),
    );
    expect(
      telemetryState.histograms.get("openclaw.harness.duration_ms")?.record,
    ).toHaveBeenCalledWith(
      90,
      expect.objectContaining({
        "openclaw.harness.id": "codex",
        "openclaw.harness.plugin": "codex-plugin",
        "openclaw.outcome": "completed",
      }),
    );
    expect(
      telemetryState.histograms.get("openclaw.harness.duration_ms")?.record,
    ).toHaveBeenCalledWith(
      90,
      expect.not.objectContaining({
        "openclaw.runId": expect.anything(),
        "openclaw.sessionKey": expect.anything(),
      }),
    );
    expect(
      telemetryState.histograms.get("openclaw.tool.execution.duration_ms")?.record,
    ).toHaveBeenCalledWith(
--- a/extensions/diagnostics-otel/src/service.ts
+++ b/extensions/diagnostics-otel/src/service.ts
@@ -81,6 +81,10 @@ type ModelCallLifecycleDiagnosticEvent = Extract<
  DiagnosticEventPayload,
  { type: "model.call.completed" | "model.call.error" }
 >;
 /**
  * Harness diagnostic events that end a harness run: the completed and error
  * variants. `harness.run.started` is intentionally not part of this union.
  */
 type HarnessRunLifecycleDiagnosticEvent = Extract<
  DiagnosticEventPayload,
  { type: "harness.run.completed" | "harness.run.error" }
 >;
 type TelemetryExporterDiagnosticEvent = Extract<
  DiagnosticEventPayload,
  { type: "telemetry.exporter" }
@@ -720,6 +724,10 @@ export function createDiagnosticsOtelService(): OpenClawPluginService {
        unit: "ms",
        description: "Agent run duration",
      });
      // Duration histogram (milliseconds) fed by both completed and failed
      // harness runs.
      const harnessDurationHistogram = meter.createHistogram("openclaw.harness.duration_ms", {
        unit: "ms",
        description: "Agent harness lifecycle duration",
      });
      const contextHistogram = meter.createHistogram("openclaw.context.tokens", {
        unit: "1",
        description: "Context window size and usage",
@@ -1426,6 +1434,82 @@ export function createDiagnosticsOtelService(): OpenClawPluginService {
        span.end(evt.ts);
      };
      /**
       * Builds the attribute set shared by harness metrics and spans.
       *
       * Error events always report the outcome "error"; completed events
       * report their own `outcome`. The channel attribute is only present when
       * the event carries a channel.
       */
      const harnessRunMetricAttrs = (evt: HarnessRunLifecycleDiagnosticEvent) => {
        const baseAttrs = {
          "openclaw.harness.id": lowCardinalityAttr(evt.harnessId, "unknown"),
          "openclaw.harness.plugin": lowCardinalityAttr(evt.pluginId),
          "openclaw.outcome": evt.type === "harness.run.error" ? "error" : evt.outcome,
          "openclaw.provider": lowCardinalityAttr(evt.provider, "unknown"),
          "openclaw.model": lowCardinalityAttr(evt.model, "unknown"),
        };
        if (!evt.channel) {
          return baseAttrs;
        }
        return {
          ...baseAttrs,
          "openclaw.channel": lowCardinalityAttr(evt.channel),
        };
      };
      /**
       * Records a finished harness run: always one duration histogram sample,
       * plus one `openclaw.harness.run` span when trace export is enabled.
       *
       * @param evt - The `harness.run.completed` diagnostic event.
       * @param metadata - Event metadata forwarded to
       *   `contextForTrustedDiagnosticSpanParent` when resolving the parent
       *   span context.
       */
      const recordHarnessRunCompleted = (
        evt: Extract<DiagnosticEventPayload, { type: "harness.run.completed" }>,
        metadata: DiagnosticEventMetadata,
      ) => {
        // Build the shared attributes once and reuse them for the metric and
        // the span, mirroring recordHarnessRunError.
        const metricAttrs = harnessRunMetricAttrs(evt);
        harnessDurationHistogram.record(evt.durationMs, metricAttrs);
        if (!tracesEnabled) {
          return;
        }
        // Span-only detail is layered on top of a copy of the metric
        // attributes; it is never added to the histogram sample.
        const spanAttrs: Record<string, string | number | boolean> = { ...metricAttrs };
        if (evt.resultClassification) {
          spanAttrs["openclaw.harness.result_classification"] = lowCardinalityAttr(
            evt.resultClassification,
          );
        }
        if (typeof evt.yieldDetected === "boolean") {
          spanAttrs["openclaw.harness.yield_detected"] = evt.yieldDetected;
        }
        if (evt.itemLifecycle) {
          spanAttrs["openclaw.harness.items.started"] = evt.itemLifecycle.startedCount;
          spanAttrs["openclaw.harness.items.completed"] = evt.itemLifecycle.completedCount;
          spanAttrs["openclaw.harness.items.active"] = evt.itemLifecycle.activeCount;
        }
        const span = spanWithDuration("openclaw.harness.run", spanAttrs, evt.durationMs, {
          parentContext: contextForTrustedDiagnosticSpanParent(evt, metadata),
          endTimeMs: evt.ts,
        });
        // A completed event can still carry the "error" outcome; only that
        // outcome marks the span as failed.
        if (evt.outcome === "error") {
          span.setStatus({
            code: SpanStatusCode.ERROR,
            message: "error",
          });
        }
        span.end(evt.ts);
      };
      /**
       * Records a failed harness run: one duration histogram sample tagged with
       * the failing phase and error category, plus an ERROR-status
       * `openclaw.harness.run` span when trace export is enabled.
       *
       * @param evt - The `harness.run.error` diagnostic event.
       * @param metadata - Event metadata forwarded to
       *   `contextForTrustedDiagnosticSpanParent` when resolving the parent
       *   span context.
       */
      const recordHarnessRunError = (
        evt: Extract<DiagnosticEventPayload, { type: "harness.run.error" }>,
        metadata: DiagnosticEventMetadata,
      ) => {
        const errorCategory = lowCardinalityAttr(evt.errorCategory, "other");
        const metricAttrs = {
          ...harnessRunMetricAttrs(evt),
          "openclaw.harness.phase": evt.phase,
          "openclaw.errorCategory": errorCategory,
        };
        harnessDurationHistogram.record(evt.durationMs, metricAttrs);
        if (tracesEnabled) {
          const spanAttrs: Record<string, string | number | boolean> = {
            ...metricAttrs,
            "error.type": errorCategory,
          };
          // The cleanup flag is span-only and is only set when cleanup failed.
          if (evt.cleanupFailed) {
            spanAttrs["openclaw.harness.cleanup_failed"] = true;
          }
          const errorSpan = spanWithDuration("openclaw.harness.run", spanAttrs, evt.durationMs, {
            parentContext: contextForTrustedDiagnosticSpanParent(evt, metadata),
            endTimeMs: evt.ts,
          });
          errorSpan.setStatus({
            code: SpanStatusCode.ERROR,
            message: errorCategory,
          });
          errorSpan.end(evt.ts);
        }
      };
      const recordContextAssembled = (
        evt: Extract<DiagnosticEventPayload, { type: "context.assembled" }>,
        metadata: DiagnosticEventMetadata,
@@ -1746,6 +1830,12 @@ export function createDiagnosticsOtelService(): OpenClawPluginService {
            case "run.completed":
              recordRunCompleted(evt, metadata);
              return;
            case "harness.run.completed":
              recordHarnessRunCompleted(evt, metadata);
              return;
            case "harness.run.error":
              recordHarnessRunError(evt, metadata);
              return;
            case "context.assembled":
              recordContextAssembled(evt, metadata);
              return;
@@ -1781,6 +1871,7 @@ export function createDiagnosticsOtelService(): OpenClawPluginService {
              return;
            case "tool.execution.started":
            case "run.started":
            case "harness.run.started":
            case "model.call.started":
            case "payload.large":
              return;
--- a/qa/scenarios/runtime/otel-trace-smoke.md
+++ b/qa/scenarios/runtime/otel-trace-smoke.md
@@ -13,6 +13,7 @@ objective: Verify a QA-lab gateway run emits bounded OpenTelemetry trace spans t
 successCriteria:
  - The diagnostics-otel plugin starts with trace export enabled.
  - A minimal QA-channel agent turn completes.
  - The trace includes the selected agent harness lifecycle span.
  - The run emits low-cardinality OpenTelemetry trace spans without content or raw diagnostic identifiers.
 plugins:
  - diagnostics-otel
@@ -33,6 +34,7 @@ docsRefs:
  - docs/concepts/qa-e2e-automation.md
 codeRefs:
  - extensions/diagnostics-otel/src/service.ts
  - src/agents/harness/v2.ts
  - extensions/qa-lab/src/suite.ts
 execution:
  kind: flow
--- a/scripts/qa-otel-smoke.ts
+++ b/scripts/qa-otel-smoke.ts
@@ -80,6 +80,7 @@ type CapturedSpan = {
 const DEFAULT_SCENARIO_ID = "otel-trace-smoke";
 const REQUIRED_SPAN_NAMES = [
  "openclaw.run",
  "openclaw.harness.run",
  "openclaw.model.call",
  "openclaw.context.assembled",
  "openclaw.message.delivery",
--- a/src/agents/harness/v2.test.ts
+++ b/src/agents/harness/v2.test.ts
@@ -1,5 +1,11 @@
 import type { Api, Model } from "@mariozechner/pi-ai";
-import { describe, expect, it, vi } from "vitest";
+import { afterEach, describe, expect, it, vi } from "vitest";
 import {
  onInternalDiagnosticEvent,
  resetDiagnosticEventsForTest,
  type DiagnosticEventMetadata,
  type DiagnosticEventPayload,
 } from "../../infra/diagnostic-events.js";
 import type { EmbeddedRunAttemptResult } from "../pi-embedded-runner/run/types.js";
 import type { AgentHarness, AgentHarnessAttemptParams } from "./types.js";
 import type { AgentHarnessV2 } from "./v2.js";
@@ -9,6 +15,7 @@ function createAttemptParams(): AgentHarnessAttemptParams {
  return {
    prompt: "hello",
    sessionId: "session-1",
    sessionKey: "session-key",
    runId: "run-1",
    sessionFile: "/tmp/session.jsonl",
    workspaceDir: "/tmp/workspace",
@@ -19,9 +26,19 @@ function createAttemptParams(): AgentHarnessAttemptParams {
    authStorage: {} as never,
    modelRegistry: {} as never,
    thinkLevel: "low",
    messageChannel: "qa",
    trigger: "manual",
  } as AgentHarnessAttemptParams;
 }
 /** Returns a fresh, fixed trace context fixture for attempt results. */
 function createDiagnosticTrace() {
  const traceFixture = {
    traceId: "1".repeat(32),
    spanId: "2".repeat(16),
    traceFlags: "01",
  };
  return traceFixture;
 }
 function createAttemptResult(): EmbeddedRunAttemptResult {
  return {
    aborted: false,
@@ -32,6 +49,7 @@ function createAttemptResult(): EmbeddedRunAttemptResult {
    promptError: null,
    promptErrorSource: null,
    sessionIdUsed: "session-1",
    diagnosticTrace: createDiagnosticTrace(),
    messagesSnapshot: [],
    assistantTexts: ["ok"],
    toolMetas: [],
@@ -46,7 +64,28 @@ function createAttemptResult(): EmbeddedRunAttemptResult {
  };
 }
 /**
  * Resolves after one `setImmediate` turn. Tests await this before asserting
  * on captured events (presumably so queued diagnostic dispatch has run).
  */
 async function flushDiagnosticEvents(): Promise<void> {
  await new Promise<void>((done) => {
    setImmediate(done);
  });
 }
 /**
  * Subscribes to internal diagnostic events and collects only those whose
  * type starts with `harness.run.`.
  *
  * @returns The live `events` array (appended to as events arrive) and the
  *   `unsubscribe` handle returned by the subscription.
  */
 function captureDiagnosticEvents(): {
  events: Array<{ event: DiagnosticEventPayload; metadata: DiagnosticEventMetadata }>;
  unsubscribe: () => void;
 } {
  const captured: Array<{ event: DiagnosticEventPayload; metadata: DiagnosticEventMetadata }> = [];
  const stopCapturing = onInternalDiagnosticEvent((event, metadata) => {
    if (!event.type.startsWith("harness.run.")) {
      return;
    }
    captured.push({ event, metadata });
  });
  return { events: captured, unsubscribe: stopCapturing };
 }
 describe("AgentHarness V2 compatibility adapter", () => {
  // Reset diagnostic event state after every test in this suite.
  afterEach(() => {
    resetDiagnosticEventsForTest();
  });
  it("executes prepare/start/send/outcome/cleanup as one bounded lifecycle", async () => {
    const params = createAttemptParams();
    const result = createAttemptResult();
@@ -102,6 +141,112 @@ describe("AgentHarness V2 compatibility adapter", () => {
    ]);
  });
  // Success path: a harness whose every lifecycle stage resolves must produce
  // exactly a started event followed by a completed event, both trusted.
  it("emits trusted harness lifecycle diagnostics for successful attempts", async () => {
    resetDiagnosticEventsForTest();
    const params = createAttemptParams();
    // Override the fixture result with the classification, yield flag, and
    // item counters that the completed event is expected to carry.
    const result = {
      ...createAttemptResult(),
      agentHarnessResultClassification: "reasoning-only",
      yieldDetected: true,
      itemLifecycle: { startedCount: 3, completedCount: 2, activeCount: 1 },
    } as EmbeddedRunAttemptResult;
    const harness: AgentHarnessV2 = {
      id: "codex",
      label: "Codex",
      pluginId: "codex-plugin",
      supports: () => ({ supported: true }),
      prepare: async () => ({
        harnessId: "codex",
        label: "Codex",
        pluginId: "codex-plugin",
        params,
        lifecycleState: "prepared",
      }),
      start: async (prepared) => ({ ...prepared, lifecycleState: "started" }),
      send: async () => result,
      resolveOutcome: async (_session, rawResult) => rawResult,
      cleanup: async () => {},
    };
    const diagnostics = captureDiagnosticEvents();
    try {
      await runAgentHarnessV2LifecycleAttempt(harness, params);
      // Wait one setImmediate turn before detaching the listener.
      await flushDiagnosticEvents();
    } finally {
      diagnostics.unsubscribe();
    }
    expect(diagnostics.events.map(({ event }) => event.type)).toEqual([
      "harness.run.started",
      "harness.run.completed",
    ]);
    expect(diagnostics.events.every(({ metadata }) => metadata.trusted)).toBe(true);
    // Index 1 is the completed event (see the order asserted above).
    expect(diagnostics.events[1]?.event).toMatchObject({
      type: "harness.run.completed",
      runId: "run-1",
      sessionKey: "session-key",
      sessionId: "session-1",
      provider: "codex",
      model: "gpt-5.4",
      channel: "qa",
      trigger: "manual",
      harnessId: "codex",
      pluginId: "codex-plugin",
      outcome: "completed",
      resultClassification: "reasoning-only",
      yieldDetected: true,
      itemLifecycle: { startedCount: 3, completedCount: 2, activeCount: 1 },
      durationMs: expect.any(Number),
    });
  });
  // Failure path: `send` throws and the follow-up cleanup throws too. The
  // attempt must reject with the send error, and the error event must name the
  // "send" phase and flag the failed cleanup.
  it("emits trusted harness error diagnostics with the failing lifecycle phase", async () => {
    resetDiagnosticEventsForTest();
    const params = createAttemptParams();
    const sendError = new Error("codex app-server send failed");
    const harness: AgentHarnessV2 = {
      id: "codex",
      label: "Codex",
      supports: () => ({ supported: true }),
      prepare: async () => ({
        harnessId: "codex",
        label: "Codex",
        params,
        lifecycleState: "prepared",
      }),
      start: async (prepared) => ({ ...prepared, lifecycleState: "started" }),
      send: async () => {
        throw sendError;
      },
      resolveOutcome: async (_session, rawResult) => rawResult,
      cleanup: async () => {
        throw new Error("cleanup failed");
      },
    };
    const diagnostics = captureDiagnosticEvents();
    try {
      // The cleanup error must not replace the original send failure.
      await expect(runAgentHarnessV2LifecycleAttempt(harness, params)).rejects.toThrow(
        "codex app-server send failed",
      );
      // Wait one setImmediate turn before detaching the listener.
      await flushDiagnosticEvents();
    } finally {
      diagnostics.unsubscribe();
    }
    expect(diagnostics.events.map(({ event }) => event.type)).toEqual([
      "harness.run.started",
      "harness.run.error",
    ]);
    expect(diagnostics.events.every(({ metadata }) => metadata.trusted)).toBe(true);
    // Index 1 is the error event (see the order asserted above).
    expect(diagnostics.events[1]?.event).toMatchObject({
      type: "harness.run.error",
      phase: "send",
      errorCategory: "Error",
      cleanupFailed: true,
      harnessId: "codex",
      durationMs: expect.any(Number),
    });
  });
  it("runs cleanup with the original failure and preserves that failure", async () => {
    const params = createAttemptParams();
    const sendError = new Error("codex app-server send failed");
--- a/src/agents/harness/v2.ts
+++ b/src/agents/harness/v2.ts
@@ -1,3 +1,10 @@
 import { diagnosticErrorCategory } from "../../infra/diagnostic-error-metadata.js";
 import {
  emitTrustedDiagnosticEvent,
  type DiagnosticHarnessRunErrorEvent,
  type DiagnosticHarnessRunOutcome,
 } from "../../infra/diagnostic-events.js";
 import type { DiagnosticTraceContext } from "../../infra/diagnostic-trace-context.js";
 import { formatErrorMessage } from "../../infra/errors.js";
 import { createSubsystemLogger } from "../../logging/subsystem.js";
 import { applyAgentHarnessResultClassification } from "./result-classification.js";
@@ -13,6 +20,7 @@ import type {
 } from "./types.js";
 const log = createSubsystemLogger("agents/harness/v2");
 // Derived from the error event's `phase` field so the phase names used here
 // always match what the diagnostic event accepts.
 type AgentHarnessV2LifecyclePhase = DiagnosticHarnessRunErrorEvent["phase"];
 type AgentHarnessV2RunBase = {
  harnessId: string;
@@ -95,6 +103,87 @@ export function adaptAgentHarnessToV2(harness: AgentHarness): AgentHarnessV2 {
  };
 }
 /**
  * Builds the fields shared by every harness lifecycle diagnostic event.
  *
  * Run, session, provider, model, and harness identifiers are always present.
  * Plugin id, session key, trigger, channel, and trace are only included when
  * the corresponding input is truthy.
  *
  * @param harness - Harness supplying `id` and the optional `pluginId`.
  * @param params - Attempt parameters supplying run/session/model details.
  * @param trace - Optional trace context to attach to the event.
  */
 function agentHarnessDiagnosticBase(
  harness: AgentHarnessV2,
  params: AgentHarnessAttemptParams,
  trace?: DiagnosticTraceContext,
 ) {
  const pluginFields = harness.pluginId ? { pluginId: harness.pluginId } : {};
  const sessionKeyFields = params.sessionKey ? { sessionKey: params.sessionKey } : {};
  const triggerFields = params.trigger ? { trigger: params.trigger } : {};
  const channelFields = params.messageChannel ? { channel: params.messageChannel } : {};
  const traceFields = trace ? { trace } : {};
  return {
    runId: params.runId,
    sessionId: params.sessionId,
    provider: params.provider,
    model: params.modelId,
    harnessId: harness.id,
    ...pluginFields,
    ...sessionKeyFields,
    ...triggerFields,
    ...channelFields,
    ...traceFields,
  };
 }
 /**
  * Maps an attempt result to the outcome reported on `harness.run.completed`.
  *
  * Precedence, first match wins: prompt error -> "error", abort flags ->
  * "aborted", any timeout flag -> "timed_out", otherwise "completed".
  *
  * NOTE(review): abort is checked before timeout, so a result flagged as both
  * aborted and timed out reports "aborted" — confirm this precedence is intended.
  */
 function agentHarnessRunOutcome(result: AgentHarnessAttemptResult): DiagnosticHarnessRunOutcome {
  if (result.promptError) {
    return "error";
  }
  const wasAborted = result.externalAbort || result.aborted;
  if (wasAborted) {
    return "aborted";
  }
  const hitTimeout = result.timedOut || result.idleTimedOut || result.timedOutDuringCompaction;
  return hitTimeout ? "timed_out" : "completed";
 }
 /**
  * Emits the trusted `harness.run.started` diagnostic event for an attempt.
  * No trace context is attached at this point.
  */
 function emitAgentHarnessRunStarted(
  harness: AgentHarnessV2,
  params: AgentHarnessAttemptParams,
 ): void {
  const baseFields = agentHarnessDiagnosticBase(harness, params);
  emitTrustedDiagnosticEvent({ type: "harness.run.started", ...baseFields });
 }
 /**
  * Emits the trusted `harness.run.completed` diagnostic event for an attempt
  * whose lifecycle stages all finished.
  *
  * The event carries the shared base fields (with the result's diagnostic
  * trace), the elapsed wall-clock time since `startedAt`, and the outcome
  * derived from the result. Result classification, yield flag, and item
  * lifecycle counters are optional on the event and are only attached when
  * the result actually provides them.
  *
  * @param params.harness - Harness that ran the attempt.
  * @param params.attemptParams - Parameters the attempt was started with.
  * @param params.result - Resolved attempt result.
  * @param params.startedAt - `Date.now()` timestamp captured at attempt start.
  */
 function emitAgentHarnessRunCompleted(params: {
  harness: AgentHarnessV2;
  attemptParams: AgentHarnessAttemptParams;
  result: AgentHarnessAttemptResult;
  startedAt: number;
 }): void {
  const { harness, attemptParams, result, startedAt } = params;
  emitTrustedDiagnosticEvent({
    type: "harness.run.completed",
    ...agentHarnessDiagnosticBase(harness, attemptParams, result.diagnosticTrace),
    durationMs: Date.now() - startedAt,
    outcome: agentHarnessRunOutcome(result),
    ...(result.agentHarnessResultClassification
      ? { resultClassification: result.agentHarnessResultClassification }
      : {}),
    ...(typeof result.yieldDetected === "boolean" ? { yieldDetected: result.yieldDetected } : {}),
    // Spreading a missing itemLifecycle would emit `{}`, which violates the
    // event shape (counts are required numbers). Omit the field instead, and
    // copy it when present so the event does not alias the result object.
    ...(result.itemLifecycle ? { itemLifecycle: { ...result.itemLifecycle } } : {}),
  });
 }
 /**
  * Emits the trusted `harness.run.error` diagnostic event for an attempt that
  * threw during one of its lifecycle phases.
  *
  * @param params.harness - Harness that ran the attempt.
  * @param params.attemptParams - Parameters the attempt was started with.
  * @param params.startedAt - `Date.now()` timestamp captured at attempt start.
  * @param params.phase - Lifecycle phase that was active when the error was thrown.
  * @param params.error - The thrown value; only its category is reported.
  * @param params.cleanupFailed - Set when the follow-up cleanup also failed;
  *   the flag is omitted from the event unless truthy.
  */
 function emitAgentHarnessRunError(params: {
  harness: AgentHarnessV2;
  attemptParams: AgentHarnessAttemptParams;
  startedAt: number;
  phase: AgentHarnessV2LifecyclePhase;
  error: unknown;
  cleanupFailed?: boolean;
 }): void {
  const baseFields = agentHarnessDiagnosticBase(params.harness, params.attemptParams);
  const durationMs = Date.now() - params.startedAt;
  const errorCategory = diagnosticErrorCategory(params.error);
  emitTrustedDiagnosticEvent({
    type: "harness.run.error",
    ...baseFields,
    durationMs,
    phase: params.phase,
    errorCategory,
    ...(params.cleanupFailed ? { cleanupFailed: true } : {}),
  });
 }
 export async function runAgentHarnessV2LifecycleAttempt(
  harness: AgentHarnessV2,
  params: AgentHarnessAttemptParams,
@@ -103,13 +192,21 @@ export async function runAgentHarnessV2LifecycleAttempt(
  let session: AgentHarnessV2Session | undefined;
  let rawResult: AgentHarnessAttemptResult | undefined;
  let result: AgentHarnessAttemptResult;
  let phase: AgentHarnessV2LifecyclePhase = "prepare";
  const startedAt = Date.now();
  emitAgentHarnessRunStarted(harness, params);
  try {
    phase = "prepare";
    prepared = await harness.prepare(params);
    phase = "start";
    session = await harness.start(prepared);
    phase = "send";
    rawResult = await harness.send(session);
    phase = "resolve";
    result = await harness.resolveOutcome(session, rawResult);
  } catch (error) {
    let cleanupFailed = false;
    try {
      await harness.cleanup({
        prepared,
@@ -118,6 +215,7 @@ export async function runAgentHarnessV2LifecycleAttempt(
        ...(rawResult === undefined ? {} : { result: rawResult }),
      });
    } catch (cleanupError) {
      cleanupFailed = true;
      // Preserve the user-visible harness failure. Cleanup errors after a
      // failed lifecycle stage must not mask the actionable runtime error.
      log.warn("agent harness cleanup failed after attempt failure", {
@@ -128,9 +226,30 @@ export async function runAgentHarnessV2LifecycleAttempt(
        originalError: formatErrorMessage(error),
      });
    }
    emitAgentHarnessRunError({
      harness,
      attemptParams: params,
      startedAt,
      phase,
      error,
      cleanupFailed,
    });
    throw error;
  }
-  await harness.cleanup({ prepared, session, result });
+  try {
    phase = "cleanup";
    await harness.cleanup({ prepared, session, result });
  } catch (error) {
    emitAgentHarnessRunError({
      harness,
      attemptParams: params,
      startedAt,
      phase,
      error,
    });
    throw error;
  }
  emitAgentHarnessRunCompleted({ harness, attemptParams: params, result, startedAt });
  return result;
 }
--- a/src/infra/diagnostic-events.ts
+++ b/src/infra/diagnostic-events.ts
@@ -256,6 +256,47 @@ export type DiagnosticRunCompletedEvent = DiagnosticRunBaseEvent & {
  errorCategory?: string;
 };
 /** Harness lifecycle stage reported as `phase` on a `harness.run.error` event. */
 export type DiagnosticHarnessRunPhase = "prepare" | "start" | "send" | "resolve" | "cleanup";
 /** Outcome reported on a `harness.run.completed` event. */
 export type DiagnosticHarnessRunOutcome = "completed" | "aborted" | "timed_out" | "error";
 /**
  * Fields shared by all harness run diagnostic events. Only `runId` and
  * `harnessId` are required; the remaining context fields are optional.
  */
 type DiagnosticHarnessRunBaseEvent = DiagnosticBaseEvent & {
  type: "harness.run.started" | "harness.run.completed" | "harness.run.error";
  runId: string;
  sessionKey?: string;
  sessionId?: string;
  provider?: string;
  model?: string;
  trigger?: string;
  channel?: string;
  // Identifier of the harness that ran the attempt.
  harnessId: string;
  // Plugin that provides the harness, when there is one.
  pluginId?: string;
 };
 /** Harness run start event; carries only the shared base fields. */
 export type DiagnosticHarnessRunStartedEvent = DiagnosticHarnessRunBaseEvent & {
  type: "harness.run.started";
 };
 /**
  * Harness run completion event. `outcome` may still be "aborted",
  * "timed_out", or "error"; a throw from a lifecycle phase is reported
  * through the separate `harness.run.error` event instead.
  */
 export type DiagnosticHarnessRunCompletedEvent = DiagnosticHarnessRunBaseEvent & {
  type: "harness.run.completed";
  // Elapsed time of the harness run in milliseconds.
  durationMs: number;
  outcome: DiagnosticHarnessRunOutcome;
  resultClassification?: "empty" | "reasoning-only" | "planning-only";
  yieldDetected?: boolean;
  // Item counters for the run; all three counts are required when present.
  itemLifecycle?: {
    startedCount: number;
    completedCount: number;
    activeCount: number;
  };
 };
 /** Harness run failure event, emitted when a lifecycle phase throws. */
 export type DiagnosticHarnessRunErrorEvent = DiagnosticHarnessRunBaseEvent & {
  type: "harness.run.error";
  // Elapsed time of the harness run in milliseconds.
  durationMs: number;
  // Lifecycle phase that was active when the error was thrown.
  phase: DiagnosticHarnessRunPhase;
  errorCategory: string;
  // True when cleanup after the failure also failed.
  cleanupFailed?: boolean;
 };
 type DiagnosticModelCallBaseEvent = DiagnosticBaseEvent & {
  type: "model.call.started" | "model.call.completed" | "model.call.error";
  runId: string;
@@ -392,6 +433,9 @@ export type DiagnosticEventPayload =
  | DiagnosticExecProcessCompletedEvent
  | DiagnosticRunStartedEvent
  | DiagnosticRunCompletedEvent
  | DiagnosticHarnessRunStartedEvent
  | DiagnosticHarnessRunCompletedEvent
  | DiagnosticHarnessRunErrorEvent
  | DiagnosticModelCallStartedEvent
  | DiagnosticModelCallCompletedEvent
  | DiagnosticModelCallErrorEvent
@@ -446,6 +490,9 @@ const ASYNC_DIAGNOSTIC_EVENT_TYPES = new Set<DiagnosticEventPayload["type"]>([
  "model.call.started",
  "model.call.completed",
  "model.call.error",
  "harness.run.started",
  "harness.run.completed",
  "harness.run.error",
  "context.assembled",
  "log.record",
 ]);
--- a/src/logging/diagnostic-stability.ts
+++ b/src/logging/diagnostic-stability.ts
@@ -305,6 +305,34 @@ function sanitizeDiagnosticEvent(event: DiagnosticEventPayload): DiagnosticStabi
      record.outcome = event.outcome;
      assignReasonCode(record, event.errorCategory);
      break;
    case "harness.run.started":
      record.source = event.harnessId;
      record.pluginId = event.pluginId;
      record.provider = event.provider;
      record.model = event.model;
      record.channel = event.channel;
      break;
    case "harness.run.completed":
      record.source = event.harnessId;
      record.pluginId = event.pluginId;
      record.provider = event.provider;
      record.model = event.model;
      record.channel = event.channel;
      record.durationMs = event.durationMs;
      record.outcome = event.outcome;
      record.count = event.itemLifecycle?.completedCount;
      break;
    case "harness.run.error":
      record.source = event.harnessId;
      record.pluginId = event.pluginId;
      record.provider = event.provider;
      record.model = event.model;
      record.channel = event.channel;
      record.durationMs = event.durationMs;
      record.outcome = "error";
      record.action = event.phase;
      assignReasonCode(record, event.errorCategory);
      break;
    case "model.call.started":
      record.provider = event.provider;
      record.model = event.model;