From 82ddcf24f5875224d2afe559147a0a621ce8c383 Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Sat, 25 Apr 2026 23:34:03 -0700 Subject: [PATCH] feat(diagnostics): add harness lifecycle telemetry --- CHANGELOG.md | 1 + docs/concepts/qa-e2e-automation.md | 6 +- .../diagnostics-otel/src/service.test.ts | 78 +++++++++- extensions/diagnostics-otel/src/service.ts | 91 +++++++++++ qa/scenarios/runtime/otel-trace-smoke.md | 2 + scripts/qa-otel-smoke.ts | 1 + src/agents/harness/v2.test.ts | 147 +++++++++++++++++- src/agents/harness/v2.ts | 121 +++++++++++++- src/infra/diagnostic-events.ts | 47 ++++++ src/logging/diagnostic-stability.ts | 28 ++++ 10 files changed, 516 insertions(+), 6 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 4512e18e12f..7d14dff836d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -26,6 +26,7 @@ Docs: https://docs.openclaw.ai - Diagnostics/OTEL: align model-call GenAI span attributes with OpenTelemetry stability opt-in semantics, keeping legacy `gen_ai.system` by default while emitting `gen_ai.provider.name` under `OTEL_SEMCONV_STABILITY_OPT_IN=gen_ai_latest_experimental`. Thanks @vincentkoc. - Diagnostics/OTEL: support signal-specific OTLP endpoint overrides for traces, metrics, and logs via config or standard OTEL environment variables. Thanks @vincentkoc. - Diagnostics/OTEL: emit bounded telemetry exporter health diagnostics for startup and log-export failures without exporting raw error text. Thanks @vincentkoc. +- Diagnostics/OTEL: export agent harness lifecycle telemetry as bounded `openclaw.harness.run` spans and `openclaw.harness.duration_ms` metrics so QA-lab, Codex, and future harnesses share one trace shape. Thanks @vincentkoc. - Plugins/CLI: add `openclaw plugins registry` for explicit persisted-registry inspection and `--refresh` repair without making normal startup rescan plugin locations. Thanks @vincentkoc. - Plugins/CLI: make `openclaw plugins list` read the cold persisted registry snapshot by default, leaving module-aware diagnostics to `plugins doctor` and `plugins inspect`. Thanks @vincentkoc. - Plugins/startup: move gateway startup plugin planning onto the versioned cold registry index, with postinstall repair for older registry files that predate startup metadata. Thanks @vincentkoc. diff --git a/docs/concepts/qa-e2e-automation.md b/docs/concepts/qa-e2e-automation.md index 272d16a185a..d56e546b6f1 100644 --- a/docs/concepts/qa-e2e-automation.md +++ b/docs/concepts/qa-e2e-automation.md @@ -59,9 +59,9 @@ pnpm qa:otel:smoke That script starts a local OTLP/HTTP trace receiver, runs the `otel-trace-smoke` QA scenario with the `diagnostics-otel` plugin enabled, then decodes the exported protobuf spans and asserts the release-critical shape: -`openclaw.run`, `openclaw.model.call`, `openclaw.context.assembled`, and -`openclaw.message.delivery` must be present; model calls must not export -`StreamAbandoned` on successful turns; raw diagnostic IDs and +`openclaw.run`, `openclaw.harness.run`, `openclaw.model.call`, +`openclaw.context.assembled`, and `openclaw.message.delivery` must be present; +model calls must not export `StreamAbandoned` on successful turns; raw diagnostic IDs and `openclaw.content.*` attributes must stay out of the trace. It writes `otel-smoke-summary.json` next to the QA suite artifacts. diff --git a/extensions/diagnostics-otel/src/service.test.ts b/extensions/diagnostics-otel/src/service.test.ts index ff88364d756..0cf3b373e71 100644 --- a/extensions/diagnostics-otel/src/service.test.ts +++ b/extensions/diagnostics-otel/src/service.test.ts @@ -1140,6 +1140,28 @@ describe("diagnostics-otel service", () => { traceFlags: "01", }, }); + emitDiagnosticEvent({ + type: "harness.run.completed", + runId: "run-1", + sessionKey: "session-key", + sessionId: "session-1", + provider: "codex", + model: "gpt-5.4", + channel: "qa", + harnessId: "codex", + pluginId: "codex-plugin", + outcome: "completed", + durationMs: 90, + resultClassification: "reasoning-only", + yieldDetected: true, + itemLifecycle: { startedCount: 3, completedCount: 2, activeCount: 1 }, + trace: { + traceId: TRACE_ID, + spanId: GRANDCHILD_SPAN_ID, + parentSpanId: CHILD_SPAN_ID, + traceFlags: "01", + }, + }); emitDiagnosticEvent({ type: "tool.execution.error", runId: "run-1", @@ -1160,7 +1182,12 @@ describe("diagnostics-otel service", () => { const spanNames = telemetryState.tracer.startSpan.mock.calls.map((call) => call[0]); expect(spanNames).toEqual( - expect.arrayContaining(["openclaw.run", "openclaw.model.call", "openclaw.tool.execution"]), + expect.arrayContaining([ + "openclaw.run", + "openclaw.model.call", + "openclaw.harness.run", + "openclaw.tool.execution", + ]), ); const runCall = telemetryState.tracer.startSpan.mock.calls.find( @@ -1207,6 +1234,36 @@ describe("diagnostics-otel service", () => { }); expect(modelCall?.[2]).toBeUndefined(); + const harnessCall = telemetryState.tracer.startSpan.mock.calls.find( + (call) => call[0] === "openclaw.harness.run", + ); + expect(harnessCall?.[1]).toMatchObject({ + attributes: { + "openclaw.harness.id": "codex", + "openclaw.harness.plugin": "codex-plugin", + "openclaw.outcome": "completed", + "openclaw.provider": "codex", + "openclaw.model": "gpt-5.4", + "openclaw.channel": "qa", + "openclaw.harness.result_classification": "reasoning-only", + "openclaw.harness.yield_detected": true, + "openclaw.harness.items.started": 3, + "openclaw.harness.items.completed": 2, + "openclaw.harness.items.active": 1, + }, + startTime: expect.any(Number), + }); + expect(harnessCall?.[1]).toEqual({ + attributes: expect.not.objectContaining({ + "openclaw.runId": expect.anything(), + "openclaw.sessionId": expect.anything(), + "openclaw.sessionKey": expect.anything(), + "openclaw.traceId": expect.anything(), + }), + startTime: expect.any(Number), + }); + expect(harnessCall?.[2]).toBeUndefined(); + const toolCall = telemetryState.tracer.startSpan.mock.calls.find( (call) => call[0] === "openclaw.tool.execution", ); @@ -1244,6 +1301,25 @@ describe("diagnostics-otel service", () => { "openclaw.runId": expect.anything(), }), ); + expect( + telemetryState.histograms.get("openclaw.harness.duration_ms")?.record, + ).toHaveBeenCalledWith( + 90, + expect.objectContaining({ + "openclaw.harness.id": "codex", + "openclaw.harness.plugin": "codex-plugin", + "openclaw.outcome": "completed", + }), + ); + expect( + telemetryState.histograms.get("openclaw.harness.duration_ms")?.record, + ).toHaveBeenCalledWith( + 90, + expect.not.objectContaining({ + "openclaw.runId": expect.anything(), + "openclaw.sessionKey": expect.anything(), + }), + ); expect( telemetryState.histograms.get("openclaw.tool.execution.duration_ms")?.record, ).toHaveBeenCalledWith( diff --git a/extensions/diagnostics-otel/src/service.ts b/extensions/diagnostics-otel/src/service.ts index 1fa67c694dc..c34ab6f1c6c 100644 --- a/extensions/diagnostics-otel/src/service.ts +++ b/extensions/diagnostics-otel/src/service.ts @@ -81,6 +81,10 @@ type ModelCallLifecycleDiagnosticEvent = Extract< DiagnosticEventPayload, { type: "model.call.completed" | "model.call.error" } >; +type HarnessRunLifecycleDiagnosticEvent = Extract< + DiagnosticEventPayload, + { type: "harness.run.completed" | "harness.run.error" } +>; type TelemetryExporterDiagnosticEvent = Extract< DiagnosticEventPayload, { type: "telemetry.exporter" } @@ -720,6 +724,10 @@ export function createDiagnosticsOtelService(): OpenClawPluginService { unit: "ms", description: "Agent run duration", }); + const harnessDurationHistogram = meter.createHistogram("openclaw.harness.duration_ms", { + unit: "ms", + description: "Agent harness lifecycle duration", + }); const contextHistogram = meter.createHistogram("openclaw.context.tokens", { unit: "1", description: "Context window size and usage", @@ -1426,6 +1434,82 @@ export function createDiagnosticsOtelService(): OpenClawPluginService { span.end(evt.ts); }; + const harnessRunMetricAttrs = (evt: HarnessRunLifecycleDiagnosticEvent) => ({ + "openclaw.harness.id": lowCardinalityAttr(evt.harnessId, "unknown"), + "openclaw.harness.plugin": lowCardinalityAttr(evt.pluginId), + "openclaw.outcome": evt.type === "harness.run.error" ? "error" : evt.outcome, + "openclaw.provider": lowCardinalityAttr(evt.provider, "unknown"), + "openclaw.model": lowCardinalityAttr(evt.model, "unknown"), + ...(evt.channel ? { "openclaw.channel": lowCardinalityAttr(evt.channel) } : {}), + }); + + const recordHarnessRunCompleted = ( + evt: Extract, + metadata: DiagnosticEventMetadata, + ) => { + harnessDurationHistogram.record(evt.durationMs, harnessRunMetricAttrs(evt)); + if (!tracesEnabled) { + return; + } + const spanAttrs: Record = { + ...harnessRunMetricAttrs(evt), + }; + if (evt.resultClassification) { + spanAttrs["openclaw.harness.result_classification"] = lowCardinalityAttr( + evt.resultClassification, + ); + } + if (typeof evt.yieldDetected === "boolean") { + spanAttrs["openclaw.harness.yield_detected"] = evt.yieldDetected; + } + if (evt.itemLifecycle) { + spanAttrs["openclaw.harness.items.started"] = evt.itemLifecycle.startedCount; + spanAttrs["openclaw.harness.items.completed"] = evt.itemLifecycle.completedCount; + spanAttrs["openclaw.harness.items.active"] = evt.itemLifecycle.activeCount; + } + const span = spanWithDuration("openclaw.harness.run", spanAttrs, evt.durationMs, { + parentContext: contextForTrustedDiagnosticSpanParent(evt, metadata), + endTimeMs: evt.ts, + }); + if (evt.outcome === "error") { + span.setStatus({ + code: SpanStatusCode.ERROR, + message: "error", + }); + } + span.end(evt.ts); + }; + + const recordHarnessRunError = ( + evt: Extract, + metadata: DiagnosticEventMetadata, + ) => { + const errorType = lowCardinalityAttr(evt.errorCategory, "other"); + const attrs = { + ...harnessRunMetricAttrs(evt), + "openclaw.harness.phase": evt.phase, + "openclaw.errorCategory": errorType, + }; + harnessDurationHistogram.record(evt.durationMs, attrs); + if (!tracesEnabled) { + return; + } + const spanAttrs: Record = { + ...attrs, + "error.type": errorType, + ...(evt.cleanupFailed ? { "openclaw.harness.cleanup_failed": true } : {}), + }; + const span = spanWithDuration("openclaw.harness.run", spanAttrs, evt.durationMs, { + parentContext: contextForTrustedDiagnosticSpanParent(evt, metadata), + endTimeMs: evt.ts, + }); + span.setStatus({ + code: SpanStatusCode.ERROR, + message: errorType, + }); + span.end(evt.ts); + }; + const recordContextAssembled = ( evt: Extract, metadata: DiagnosticEventMetadata, @@ -1746,6 +1830,12 @@ export function createDiagnosticsOtelService(): OpenClawPluginService { case "run.completed": recordRunCompleted(evt, metadata); return; + case "harness.run.completed": + recordHarnessRunCompleted(evt, metadata); + return; + case "harness.run.error": + recordHarnessRunError(evt, metadata); + return; case "context.assembled": recordContextAssembled(evt, metadata); return; @@ -1781,6 +1871,7 @@ export function createDiagnosticsOtelService(): OpenClawPluginService { return; case "tool.execution.started": case "run.started": + case "harness.run.started": case "model.call.started": case "payload.large": return; diff --git a/qa/scenarios/runtime/otel-trace-smoke.md b/qa/scenarios/runtime/otel-trace-smoke.md index bafd3b39f86..f4a94e70b3f 100644 --- a/qa/scenarios/runtime/otel-trace-smoke.md +++ b/qa/scenarios/runtime/otel-trace-smoke.md @@ -13,6 +13,7 @@ objective: Verify a QA-lab gateway run emits bounded OpenTelemetry trace spans t successCriteria: - The diagnostics-otel plugin starts with trace export enabled. - A minimal QA-channel agent turn completes. + - The trace includes the selected agent harness lifecycle span. - The run emits low-cardinality OpenTelemetry trace spans without content or raw diagnostic identifiers. plugins: - diagnostics-otel @@ -33,6 +34,7 @@ docsRefs: - docs/concepts/qa-e2e-automation.md codeRefs: - extensions/diagnostics-otel/src/service.ts + - src/agents/harness/v2.ts - extensions/qa-lab/src/suite.ts execution: kind: flow diff --git a/scripts/qa-otel-smoke.ts b/scripts/qa-otel-smoke.ts index d5444ba9562..a1660e8fedc 100644 --- a/scripts/qa-otel-smoke.ts +++ b/scripts/qa-otel-smoke.ts @@ -80,6 +80,7 @@ type CapturedSpan = { const DEFAULT_SCENARIO_ID = "otel-trace-smoke"; const REQUIRED_SPAN_NAMES = [ "openclaw.run", + "openclaw.harness.run", "openclaw.model.call", "openclaw.context.assembled", "openclaw.message.delivery", diff --git a/src/agents/harness/v2.test.ts b/src/agents/harness/v2.test.ts index dc6f196715c..e8ea68a9897 100644 --- a/src/agents/harness/v2.test.ts +++ b/src/agents/harness/v2.test.ts @@ -1,5 +1,11 @@ import type { Api, Model } from "@mariozechner/pi-ai"; -import { describe, expect, it, vi } from "vitest"; +import { afterEach, describe, expect, it, vi } from "vitest"; +import { + onInternalDiagnosticEvent, + resetDiagnosticEventsForTest, + type DiagnosticEventMetadata, + type DiagnosticEventPayload, +} from "../../infra/diagnostic-events.js"; import type { EmbeddedRunAttemptResult } from "../pi-embedded-runner/run/types.js"; import type { AgentHarness, AgentHarnessAttemptParams } from "./types.js"; import type { AgentHarnessV2 } from "./v2.js"; @@ -9,6 +15,7 @@ function createAttemptParams(): AgentHarnessAttemptParams { return { prompt: "hello", sessionId: "session-1", + sessionKey: "session-key", runId: "run-1", sessionFile: "/tmp/session.jsonl", workspaceDir: "/tmp/workspace", @@ -19,9 +26,19 @@ function createAttemptParams(): AgentHarnessAttemptParams { authStorage: {} as never, modelRegistry: {} as never, thinkLevel: "low", + messageChannel: "qa", + trigger: "manual", } as AgentHarnessAttemptParams; } +function createDiagnosticTrace() { + return { + traceId: "11111111111111111111111111111111", + spanId: "2222222222222222", + traceFlags: "01", + }; +} + function createAttemptResult(): EmbeddedRunAttemptResult { return { aborted: false, @@ -32,6 +49,7 @@ function createAttemptResult(): EmbeddedRunAttemptResult { promptError: null, promptErrorSource: null, sessionIdUsed: "session-1", + diagnosticTrace: createDiagnosticTrace(), messagesSnapshot: [], assistantTexts: ["ok"], toolMetas: [], @@ -46,7 +64,28 @@ function createAttemptResult(): EmbeddedRunAttemptResult { }; } +async function flushDiagnosticEvents(): Promise { + await new Promise((resolve) => setImmediate(resolve)); +} + +function captureDiagnosticEvents(): { + events: Array<{ event: DiagnosticEventPayload; metadata: DiagnosticEventMetadata }>; + unsubscribe: () => void; +} { + const events: Array<{ event: DiagnosticEventPayload; metadata: DiagnosticEventMetadata }> = []; + const unsubscribe = onInternalDiagnosticEvent((event, metadata) => { + if (event.type.startsWith("harness.run.")) { + events.push({ event, metadata }); + } + }); + return { events, unsubscribe }; +} + describe("AgentHarness V2 compatibility adapter", () => { + afterEach(() => { + resetDiagnosticEventsForTest(); + }); + it("executes prepare/start/send/outcome/cleanup as one bounded lifecycle", async () => { const params = createAttemptParams(); const result = createAttemptResult(); @@ -102,6 +141,112 @@ describe("AgentHarness V2 compatibility adapter", () => { ]); }); + it("emits trusted harness lifecycle diagnostics for successful attempts", async () => { + resetDiagnosticEventsForTest(); + const params = createAttemptParams(); + const result = { + ...createAttemptResult(), + agentHarnessResultClassification: "reasoning-only", + yieldDetected: true, + itemLifecycle: { startedCount: 3, completedCount: 2, activeCount: 1 }, + } as EmbeddedRunAttemptResult; + const harness: AgentHarnessV2 = { + id: "codex", + label: "Codex", + pluginId: "codex-plugin", + supports: () => ({ supported: true }), + prepare: async () => ({ + harnessId: "codex", + label: "Codex", + pluginId: "codex-plugin", + params, + lifecycleState: "prepared", + }), + start: async (prepared) => ({ ...prepared, lifecycleState: "started" }), + send: async () => result, + resolveOutcome: async (_session, rawResult) => rawResult, + cleanup: async () => {}, + }; + const diagnostics = captureDiagnosticEvents(); + try { + await runAgentHarnessV2LifecycleAttempt(harness, params); + await flushDiagnosticEvents(); + } finally { + diagnostics.unsubscribe(); + } + + expect(diagnostics.events.map(({ event }) => event.type)).toEqual([ + "harness.run.started", + "harness.run.completed", + ]); + expect(diagnostics.events.every(({ metadata }) => metadata.trusted)).toBe(true); + expect(diagnostics.events[1]?.event).toMatchObject({ + type: "harness.run.completed", + runId: "run-1", + sessionKey: "session-key", + sessionId: "session-1", + provider: "codex", + model: "gpt-5.4", + channel: "qa", + trigger: "manual", + harnessId: "codex", + pluginId: "codex-plugin", + outcome: "completed", + resultClassification: "reasoning-only", + yieldDetected: true, + itemLifecycle: { startedCount: 3, completedCount: 2, activeCount: 1 }, + durationMs: expect.any(Number), + }); + }); + + it("emits trusted harness error diagnostics with the failing lifecycle phase", async () => { + resetDiagnosticEventsForTest(); + const params = createAttemptParams(); + const sendError = new Error("codex app-server send failed"); + const harness: AgentHarnessV2 = { + id: "codex", + label: "Codex", + supports: () => ({ supported: true }), + prepare: async () => ({ + harnessId: "codex", + label: "Codex", + params, + lifecycleState: "prepared", + }), + start: async (prepared) => ({ ...prepared, lifecycleState: "started" }), + send: async () => { + throw sendError; + }, + resolveOutcome: async (_session, rawResult) => rawResult, + cleanup: async () => { + throw new Error("cleanup failed"); + }, + }; + const diagnostics = captureDiagnosticEvents(); + try { + await expect(runAgentHarnessV2LifecycleAttempt(harness, params)).rejects.toThrow( + "codex app-server send failed", + ); + await flushDiagnosticEvents(); + } finally { + diagnostics.unsubscribe(); + } + + expect(diagnostics.events.map(({ event }) => event.type)).toEqual([ + "harness.run.started", + "harness.run.error", + ]); + expect(diagnostics.events.every(({ metadata }) => metadata.trusted)).toBe(true); + expect(diagnostics.events[1]?.event).toMatchObject({ + type: "harness.run.error", + phase: "send", + errorCategory: "Error", + cleanupFailed: true, + harnessId: "codex", + durationMs: expect.any(Number), + }); + }); + it("runs cleanup with the original failure and preserves that failure", async () => { const params = createAttemptParams(); const sendError = new Error("codex app-server send failed"); diff --git a/src/agents/harness/v2.ts b/src/agents/harness/v2.ts index 0993bddda1f..bf14a314cdc 100644 --- a/src/agents/harness/v2.ts +++ b/src/agents/harness/v2.ts @@ -1,3 +1,10 @@ +import { diagnosticErrorCategory } from "../../infra/diagnostic-error-metadata.js"; +import { + emitTrustedDiagnosticEvent, + type DiagnosticHarnessRunErrorEvent, + type DiagnosticHarnessRunOutcome, +} from "../../infra/diagnostic-events.js"; +import type { DiagnosticTraceContext } from "../../infra/diagnostic-trace-context.js"; import { formatErrorMessage } from "../../infra/errors.js"; import { createSubsystemLogger } from "../../logging/subsystem.js"; import { applyAgentHarnessResultClassification } from "./result-classification.js"; @@ -13,6 +20,7 @@ import type { } from "./types.js"; const log = createSubsystemLogger("agents/harness/v2"); +type AgentHarnessV2LifecyclePhase = DiagnosticHarnessRunErrorEvent["phase"]; type AgentHarnessV2RunBase = { harnessId: string; @@ -95,6 +103,87 @@ export function adaptAgentHarnessToV2(harness: AgentHarness): AgentHarnessV2 { }; } +function agentHarnessDiagnosticBase( + harness: AgentHarnessV2, + params: AgentHarnessAttemptParams, + trace?: DiagnosticTraceContext, +) { + return { + runId: params.runId, + sessionId: params.sessionId, + provider: params.provider, + model: params.modelId, + harnessId: harness.id, + ...(harness.pluginId ? { pluginId: harness.pluginId } : {}), + ...(params.sessionKey ? { sessionKey: params.sessionKey } : {}), + ...(params.trigger ? { trigger: params.trigger } : {}), + ...(params.messageChannel ? { channel: params.messageChannel } : {}), + ...(trace ? { trace } : {}), + }; +} + +function agentHarnessRunOutcome(result: AgentHarnessAttemptResult): DiagnosticHarnessRunOutcome { + if (result.promptError) { + return "error"; + } + if (result.externalAbort || result.aborted) { + return "aborted"; + } + if (result.timedOut || result.idleTimedOut || result.timedOutDuringCompaction) { + return "timed_out"; + } + return "completed"; +} + +function emitAgentHarnessRunStarted( + harness: AgentHarnessV2, + params: AgentHarnessAttemptParams, +): void { + emitTrustedDiagnosticEvent({ + type: "harness.run.started", + ...agentHarnessDiagnosticBase(harness, params), + }); +} + +function emitAgentHarnessRunCompleted(params: { + harness: AgentHarnessV2; + attemptParams: AgentHarnessAttemptParams; + result: AgentHarnessAttemptResult; + startedAt: number; +}): void { + const { harness, attemptParams, result, startedAt } = params; + emitTrustedDiagnosticEvent({ + type: "harness.run.completed", + ...agentHarnessDiagnosticBase(harness, attemptParams, result.diagnosticTrace), + durationMs: Date.now() - startedAt, + outcome: agentHarnessRunOutcome(result), + ...(result.agentHarnessResultClassification + ? { resultClassification: result.agentHarnessResultClassification } + : {}), + ...(typeof result.yieldDetected === "boolean" ? { yieldDetected: result.yieldDetected } : {}), + itemLifecycle: { ...result.itemLifecycle }, + }); +} + +function emitAgentHarnessRunError(params: { + harness: AgentHarnessV2; + attemptParams: AgentHarnessAttemptParams; + startedAt: number; + phase: AgentHarnessV2LifecyclePhase; + error: unknown; + cleanupFailed?: boolean; +}): void { + const { harness, attemptParams, startedAt, phase, error, cleanupFailed } = params; + emitTrustedDiagnosticEvent({ + type: "harness.run.error", + ...agentHarnessDiagnosticBase(harness, attemptParams), + durationMs: Date.now() - startedAt, + phase, + errorCategory: diagnosticErrorCategory(error), + ...(cleanupFailed ? { cleanupFailed: true } : {}), + }); +} + export async function runAgentHarnessV2LifecycleAttempt( harness: AgentHarnessV2, params: AgentHarnessAttemptParams, @@ -103,13 +192,21 @@ export async function runAgentHarnessV2LifecycleAttempt( let session: AgentHarnessV2Session | undefined; let rawResult: AgentHarnessAttemptResult | undefined; let result: AgentHarnessAttemptResult; + let phase: AgentHarnessV2LifecyclePhase = "prepare"; + const startedAt = Date.now(); + emitAgentHarnessRunStarted(harness, params); try { + phase = "prepare"; prepared = await harness.prepare(params); + phase = "start"; session = await harness.start(prepared); + phase = "send"; rawResult = await harness.send(session); + phase = "resolve"; result = await harness.resolveOutcome(session, rawResult); } catch (error) { + let cleanupFailed = false; try { await harness.cleanup({ prepared, @@ -118,6 +215,7 @@ export async function runAgentHarnessV2LifecycleAttempt( ...(rawResult === undefined ? {} : { result: rawResult }), }); } catch (cleanupError) { + cleanupFailed = true; // Preserve the user-visible harness failure. Cleanup errors after a // failed lifecycle stage must not mask the actionable runtime error. log.warn("agent harness cleanup failed after attempt failure", { @@ -128,9 +226,30 @@ export async function runAgentHarnessV2LifecycleAttempt( originalError: formatErrorMessage(error), }); } + emitAgentHarnessRunError({ + harness, + attemptParams: params, + startedAt, + phase, + error, + cleanupFailed, + }); throw error; } - await harness.cleanup({ prepared, session, result }); + try { + phase = "cleanup"; + await harness.cleanup({ prepared, session, result }); + } catch (error) { + emitAgentHarnessRunError({ + harness, + attemptParams: params, + startedAt, + phase, + error, + }); + throw error; + } + emitAgentHarnessRunCompleted({ harness, attemptParams: params, result, startedAt }); return result; } diff --git a/src/infra/diagnostic-events.ts b/src/infra/diagnostic-events.ts index b93b44d9b64..d6ed9ebd091 100644 --- a/src/infra/diagnostic-events.ts +++ b/src/infra/diagnostic-events.ts @@ -256,6 +256,47 @@ export type DiagnosticRunCompletedEvent = DiagnosticRunBaseEvent & { errorCategory?: string; }; +export type DiagnosticHarnessRunPhase = "prepare" | "start" | "send" | "resolve" | "cleanup"; +export type DiagnosticHarnessRunOutcome = "completed" | "aborted" | "timed_out" | "error"; + +type DiagnosticHarnessRunBaseEvent = DiagnosticBaseEvent & { + type: "harness.run.started" | "harness.run.completed" | "harness.run.error"; + runId: string; + sessionKey?: string; + sessionId?: string; + provider?: string; + model?: string; + trigger?: string; + channel?: string; + harnessId: string; + pluginId?: string; +}; + +export type DiagnosticHarnessRunStartedEvent = DiagnosticHarnessRunBaseEvent & { + type: "harness.run.started"; +}; + +export type DiagnosticHarnessRunCompletedEvent = DiagnosticHarnessRunBaseEvent & { + type: "harness.run.completed"; + durationMs: number; + outcome: DiagnosticHarnessRunOutcome; + resultClassification?: "empty" | "reasoning-only" | "planning-only"; + yieldDetected?: boolean; + itemLifecycle?: { + startedCount: number; + completedCount: number; + activeCount: number; + }; +}; + +export type DiagnosticHarnessRunErrorEvent = DiagnosticHarnessRunBaseEvent & { + type: "harness.run.error"; + durationMs: number; + phase: DiagnosticHarnessRunPhase; + errorCategory: string; + cleanupFailed?: boolean; +}; + type DiagnosticModelCallBaseEvent = DiagnosticBaseEvent & { type: "model.call.started" | "model.call.completed" | "model.call.error"; runId: string; @@ -392,6 +433,9 @@ export type DiagnosticEventPayload = | DiagnosticExecProcessCompletedEvent | DiagnosticRunStartedEvent | DiagnosticRunCompletedEvent + | DiagnosticHarnessRunStartedEvent + | DiagnosticHarnessRunCompletedEvent + | DiagnosticHarnessRunErrorEvent | DiagnosticModelCallStartedEvent | DiagnosticModelCallCompletedEvent | DiagnosticModelCallErrorEvent @@ -446,6 +490,9 @@ const ASYNC_DIAGNOSTIC_EVENT_TYPES = new Set([ "model.call.started", "model.call.completed", "model.call.error", + "harness.run.started", + "harness.run.completed", + "harness.run.error", "context.assembled", "log.record", ]); diff --git a/src/logging/diagnostic-stability.ts b/src/logging/diagnostic-stability.ts index 50a2bbad2ca..086a353fa37 100644 --- a/src/logging/diagnostic-stability.ts +++ b/src/logging/diagnostic-stability.ts @@ -305,6 +305,34 @@ function sanitizeDiagnosticEvent(event: DiagnosticEventPayload): DiagnosticStabi record.outcome = event.outcome; assignReasonCode(record, event.errorCategory); break; + case "harness.run.started": + record.source = event.harnessId; + record.pluginId = event.pluginId; + record.provider = event.provider; + record.model = event.model; + record.channel = event.channel; + break; + case "harness.run.completed": + record.source = event.harnessId; + record.pluginId = event.pluginId; + record.provider = event.provider; + record.model = event.model; + record.channel = event.channel; + record.durationMs = event.durationMs; + record.outcome = event.outcome; + record.count = event.itemLifecycle?.completedCount; + break; + case "harness.run.error": + record.source = event.harnessId; + record.pluginId = event.pluginId; + record.provider = event.provider; + record.model = event.model; + record.channel = event.channel; + record.durationMs = event.durationMs; + record.outcome = "error"; + record.action = event.phase; + assignReasonCode(record, event.errorCategory); + break; case "model.call.started": record.provider = event.provider; record.model = event.model;