mirror of
https://github.com/openclaw/openclaw.git
synced 2026-05-06 07:30:43 +00:00
feat(diagnostics): add harness lifecycle telemetry
This commit is contained in:
@@ -26,6 +26,7 @@ Docs: https://docs.openclaw.ai
|
|||||||
- Diagnostics/OTEL: align model-call GenAI span attributes with OpenTelemetry stability opt-in semantics, keeping legacy `gen_ai.system` by default while emitting `gen_ai.provider.name` under `OTEL_SEMCONV_STABILITY_OPT_IN=gen_ai_latest_experimental`. Thanks @vincentkoc.
|
- Diagnostics/OTEL: align model-call GenAI span attributes with OpenTelemetry stability opt-in semantics, keeping legacy `gen_ai.system` by default while emitting `gen_ai.provider.name` under `OTEL_SEMCONV_STABILITY_OPT_IN=gen_ai_latest_experimental`. Thanks @vincentkoc.
|
||||||
- Diagnostics/OTEL: support signal-specific OTLP endpoint overrides for traces, metrics, and logs via config or standard OTEL environment variables. Thanks @vincentkoc.
|
- Diagnostics/OTEL: support signal-specific OTLP endpoint overrides for traces, metrics, and logs via config or standard OTEL environment variables. Thanks @vincentkoc.
|
||||||
- Diagnostics/OTEL: emit bounded telemetry exporter health diagnostics for startup and log-export failures without exporting raw error text. Thanks @vincentkoc.
|
- Diagnostics/OTEL: emit bounded telemetry exporter health diagnostics for startup and log-export failures without exporting raw error text. Thanks @vincentkoc.
|
||||||
|
- Diagnostics/OTEL: export agent harness lifecycle telemetry as bounded `openclaw.harness.run` spans and `openclaw.harness.duration_ms` metrics so QA-lab, Codex, and future harnesses share one trace shape. Thanks @vincentkoc.
|
||||||
- Plugins/CLI: add `openclaw plugins registry` for explicit persisted-registry inspection and `--refresh` repair without making normal startup rescan plugin locations. Thanks @vincentkoc.
|
- Plugins/CLI: add `openclaw plugins registry` for explicit persisted-registry inspection and `--refresh` repair without making normal startup rescan plugin locations. Thanks @vincentkoc.
|
||||||
- Plugins/CLI: make `openclaw plugins list` read the cold persisted registry snapshot by default, leaving module-aware diagnostics to `plugins doctor` and `plugins inspect`. Thanks @vincentkoc.
|
- Plugins/CLI: make `openclaw plugins list` read the cold persisted registry snapshot by default, leaving module-aware diagnostics to `plugins doctor` and `plugins inspect`. Thanks @vincentkoc.
|
||||||
- Plugins/startup: move gateway startup plugin planning onto the versioned cold registry index, with postinstall repair for older registry files that predate startup metadata. Thanks @vincentkoc.
|
- Plugins/startup: move gateway startup plugin planning onto the versioned cold registry index, with postinstall repair for older registry files that predate startup metadata. Thanks @vincentkoc.
|
||||||
|
|||||||
@@ -59,9 +59,9 @@ pnpm qa:otel:smoke
|
|||||||
That script starts a local OTLP/HTTP trace receiver, runs the
|
That script starts a local OTLP/HTTP trace receiver, runs the
|
||||||
`otel-trace-smoke` QA scenario with the `diagnostics-otel` plugin enabled, then
|
`otel-trace-smoke` QA scenario with the `diagnostics-otel` plugin enabled, then
|
||||||
decodes the exported protobuf spans and asserts the release-critical shape:
|
decodes the exported protobuf spans and asserts the release-critical shape:
|
||||||
`openclaw.run`, `openclaw.model.call`, `openclaw.context.assembled`, and
|
`openclaw.run`, `openclaw.harness.run`, `openclaw.model.call`,
|
||||||
`openclaw.message.delivery` must be present; model calls must not export
|
`openclaw.context.assembled`, and `openclaw.message.delivery` must be present;
|
||||||
`StreamAbandoned` on successful turns; raw diagnostic IDs and
|
model calls must not export `StreamAbandoned` on successful turns; raw diagnostic IDs and
|
||||||
`openclaw.content.*` attributes must stay out of the trace. It writes
|
`openclaw.content.*` attributes must stay out of the trace. It writes
|
||||||
`otel-smoke-summary.json` next to the QA suite artifacts.
|
`otel-smoke-summary.json` next to the QA suite artifacts.
|
||||||
|
|
||||||
|
|||||||
@@ -1140,6 +1140,28 @@ describe("diagnostics-otel service", () => {
|
|||||||
traceFlags: "01",
|
traceFlags: "01",
|
||||||
},
|
},
|
||||||
});
|
});
|
||||||
|
emitDiagnosticEvent({
|
||||||
|
type: "harness.run.completed",
|
||||||
|
runId: "run-1",
|
||||||
|
sessionKey: "session-key",
|
||||||
|
sessionId: "session-1",
|
||||||
|
provider: "codex",
|
||||||
|
model: "gpt-5.4",
|
||||||
|
channel: "qa",
|
||||||
|
harnessId: "codex",
|
||||||
|
pluginId: "codex-plugin",
|
||||||
|
outcome: "completed",
|
||||||
|
durationMs: 90,
|
||||||
|
resultClassification: "reasoning-only",
|
||||||
|
yieldDetected: true,
|
||||||
|
itemLifecycle: { startedCount: 3, completedCount: 2, activeCount: 1 },
|
||||||
|
trace: {
|
||||||
|
traceId: TRACE_ID,
|
||||||
|
spanId: GRANDCHILD_SPAN_ID,
|
||||||
|
parentSpanId: CHILD_SPAN_ID,
|
||||||
|
traceFlags: "01",
|
||||||
|
},
|
||||||
|
});
|
||||||
emitDiagnosticEvent({
|
emitDiagnosticEvent({
|
||||||
type: "tool.execution.error",
|
type: "tool.execution.error",
|
||||||
runId: "run-1",
|
runId: "run-1",
|
||||||
@@ -1160,7 +1182,12 @@ describe("diagnostics-otel service", () => {
|
|||||||
|
|
||||||
const spanNames = telemetryState.tracer.startSpan.mock.calls.map((call) => call[0]);
|
const spanNames = telemetryState.tracer.startSpan.mock.calls.map((call) => call[0]);
|
||||||
expect(spanNames).toEqual(
|
expect(spanNames).toEqual(
|
||||||
expect.arrayContaining(["openclaw.run", "openclaw.model.call", "openclaw.tool.execution"]),
|
expect.arrayContaining([
|
||||||
|
"openclaw.run",
|
||||||
|
"openclaw.model.call",
|
||||||
|
"openclaw.harness.run",
|
||||||
|
"openclaw.tool.execution",
|
||||||
|
]),
|
||||||
);
|
);
|
||||||
|
|
||||||
const runCall = telemetryState.tracer.startSpan.mock.calls.find(
|
const runCall = telemetryState.tracer.startSpan.mock.calls.find(
|
||||||
@@ -1207,6 +1234,36 @@ describe("diagnostics-otel service", () => {
|
|||||||
});
|
});
|
||||||
expect(modelCall?.[2]).toBeUndefined();
|
expect(modelCall?.[2]).toBeUndefined();
|
||||||
|
|
||||||
|
const harnessCall = telemetryState.tracer.startSpan.mock.calls.find(
|
||||||
|
(call) => call[0] === "openclaw.harness.run",
|
||||||
|
);
|
||||||
|
expect(harnessCall?.[1]).toMatchObject({
|
||||||
|
attributes: {
|
||||||
|
"openclaw.harness.id": "codex",
|
||||||
|
"openclaw.harness.plugin": "codex-plugin",
|
||||||
|
"openclaw.outcome": "completed",
|
||||||
|
"openclaw.provider": "codex",
|
||||||
|
"openclaw.model": "gpt-5.4",
|
||||||
|
"openclaw.channel": "qa",
|
||||||
|
"openclaw.harness.result_classification": "reasoning-only",
|
||||||
|
"openclaw.harness.yield_detected": true,
|
||||||
|
"openclaw.harness.items.started": 3,
|
||||||
|
"openclaw.harness.items.completed": 2,
|
||||||
|
"openclaw.harness.items.active": 1,
|
||||||
|
},
|
||||||
|
startTime: expect.any(Number),
|
||||||
|
});
|
||||||
|
expect(harnessCall?.[1]).toEqual({
|
||||||
|
attributes: expect.not.objectContaining({
|
||||||
|
"openclaw.runId": expect.anything(),
|
||||||
|
"openclaw.sessionId": expect.anything(),
|
||||||
|
"openclaw.sessionKey": expect.anything(),
|
||||||
|
"openclaw.traceId": expect.anything(),
|
||||||
|
}),
|
||||||
|
startTime: expect.any(Number),
|
||||||
|
});
|
||||||
|
expect(harnessCall?.[2]).toBeUndefined();
|
||||||
|
|
||||||
const toolCall = telemetryState.tracer.startSpan.mock.calls.find(
|
const toolCall = telemetryState.tracer.startSpan.mock.calls.find(
|
||||||
(call) => call[0] === "openclaw.tool.execution",
|
(call) => call[0] === "openclaw.tool.execution",
|
||||||
);
|
);
|
||||||
@@ -1244,6 +1301,25 @@ describe("diagnostics-otel service", () => {
|
|||||||
"openclaw.runId": expect.anything(),
|
"openclaw.runId": expect.anything(),
|
||||||
}),
|
}),
|
||||||
);
|
);
|
||||||
|
expect(
|
||||||
|
telemetryState.histograms.get("openclaw.harness.duration_ms")?.record,
|
||||||
|
).toHaveBeenCalledWith(
|
||||||
|
90,
|
||||||
|
expect.objectContaining({
|
||||||
|
"openclaw.harness.id": "codex",
|
||||||
|
"openclaw.harness.plugin": "codex-plugin",
|
||||||
|
"openclaw.outcome": "completed",
|
||||||
|
}),
|
||||||
|
);
|
||||||
|
expect(
|
||||||
|
telemetryState.histograms.get("openclaw.harness.duration_ms")?.record,
|
||||||
|
).toHaveBeenCalledWith(
|
||||||
|
90,
|
||||||
|
expect.not.objectContaining({
|
||||||
|
"openclaw.runId": expect.anything(),
|
||||||
|
"openclaw.sessionKey": expect.anything(),
|
||||||
|
}),
|
||||||
|
);
|
||||||
expect(
|
expect(
|
||||||
telemetryState.histograms.get("openclaw.tool.execution.duration_ms")?.record,
|
telemetryState.histograms.get("openclaw.tool.execution.duration_ms")?.record,
|
||||||
).toHaveBeenCalledWith(
|
).toHaveBeenCalledWith(
|
||||||
|
|||||||
@@ -81,6 +81,10 @@ type ModelCallLifecycleDiagnosticEvent = Extract<
|
|||||||
DiagnosticEventPayload,
|
DiagnosticEventPayload,
|
||||||
{ type: "model.call.completed" | "model.call.error" }
|
{ type: "model.call.completed" | "model.call.error" }
|
||||||
>;
|
>;
|
||||||
|
/**
 * Terminal harness lifecycle events: the diagnostic payloads whose `type` is
 * either `harness.run.completed` or `harness.run.error`.
 */
type HarnessRunLifecycleDiagnosticEvent =
  | Extract<DiagnosticEventPayload, { type: "harness.run.completed" }>
  | Extract<DiagnosticEventPayload, { type: "harness.run.error" }>;
|
||||||
type TelemetryExporterDiagnosticEvent = Extract<
|
type TelemetryExporterDiagnosticEvent = Extract<
|
||||||
DiagnosticEventPayload,
|
DiagnosticEventPayload,
|
||||||
{ type: "telemetry.exporter" }
|
{ type: "telemetry.exporter" }
|
||||||
@@ -720,6 +724,10 @@ export function createDiagnosticsOtelService(): OpenClawPluginService {
|
|||||||
unit: "ms",
|
unit: "ms",
|
||||||
description: "Agent run duration",
|
description: "Agent run duration",
|
||||||
});
|
});
|
||||||
|
const harnessDurationHistogram = meter.createHistogram("openclaw.harness.duration_ms", {
|
||||||
|
unit: "ms",
|
||||||
|
description: "Agent harness lifecycle duration",
|
||||||
|
});
|
||||||
const contextHistogram = meter.createHistogram("openclaw.context.tokens", {
|
const contextHistogram = meter.createHistogram("openclaw.context.tokens", {
|
||||||
unit: "1",
|
unit: "1",
|
||||||
description: "Context window size and usage",
|
description: "Context window size and usage",
|
||||||
@@ -1426,6 +1434,82 @@ export function createDiagnosticsOtelService(): OpenClawPluginService {
|
|||||||
span.end(evt.ts);
|
span.end(evt.ts);
|
||||||
};
|
};
|
||||||
|
|
||||||
|
/**
 * Builds the attribute set shared by the harness duration histogram and the
 * `openclaw.harness.run` span.
 *
 * Error events are always reported with the fixed outcome `"error"`; completed
 * events report their own `outcome`. The channel attribute is only present
 * when the event carries a truthy `channel`.
 *
 * @param evt - A `harness.run.completed` or `harness.run.error` event.
 * @returns The attribute object for the event.
 */
const harnessRunMetricAttrs = (evt: HarnessRunLifecycleDiagnosticEvent) => {
  const baseAttrs = {
    "openclaw.harness.id": lowCardinalityAttr(evt.harnessId, "unknown"),
    "openclaw.harness.plugin": lowCardinalityAttr(evt.pluginId),
    "openclaw.outcome": evt.type === "harness.run.error" ? "error" : evt.outcome,
    "openclaw.provider": lowCardinalityAttr(evt.provider, "unknown"),
    "openclaw.model": lowCardinalityAttr(evt.model, "unknown"),
  };
  if (!evt.channel) {
    return baseAttrs;
  }
  return { ...baseAttrs, "openclaw.channel": lowCardinalityAttr(evt.channel) };
};
|
||||||
|
|
||||||
|
/**
 * Exports telemetry for a `harness.run.completed` event.
 *
 * The duration histogram sample is recorded first, so the metric is emitted
 * even when trace export is disabled. When traces are enabled, an
 * `openclaw.harness.run` span is created through `spanWithDuration` using the
 * event's duration and `evt.ts` as the end time, then ended at `evt.ts`.
 *
 * @param evt - The completed harness run event.
 * @param metadata - Event metadata, forwarded to
 *   `contextForTrustedDiagnosticSpanParent` to resolve the parent context.
 */
const recordHarnessRunCompleted = (
  evt: Extract<DiagnosticEventPayload, { type: "harness.run.completed" }>,
  metadata: DiagnosticEventMetadata,
) => {
  // Build the shared attributes once and reuse them for the metric and the span.
  const metricAttrs = harnessRunMetricAttrs(evt);
  harnessDurationHistogram.record(evt.durationMs, metricAttrs);
  if (!tracesEnabled) {
    return;
  }
  // Copy so the span-only attributes never leak into the metric attribute object.
  const spanAttrs: Record<string, string | number | boolean> = { ...metricAttrs };
  if (evt.resultClassification) {
    spanAttrs["openclaw.harness.result_classification"] = lowCardinalityAttr(
      evt.resultClassification,
    );
  }
  if (typeof evt.yieldDetected === "boolean") {
    spanAttrs["openclaw.harness.yield_detected"] = evt.yieldDetected;
  }
  if (evt.itemLifecycle) {
    // The lifecycle object can arrive without every counter populated (for
    // example when it was built by spreading a missing source object), so only
    // real numbers are exported as span attributes.
    const itemCounts: ReadonlyArray<readonly [string, unknown]> = [
      ["openclaw.harness.items.started", evt.itemLifecycle.startedCount],
      ["openclaw.harness.items.completed", evt.itemLifecycle.completedCount],
      ["openclaw.harness.items.active", evt.itemLifecycle.activeCount],
    ];
    for (const [attrName, count] of itemCounts) {
      if (typeof count === "number") {
        spanAttrs[attrName] = count;
      }
    }
  }
  const span = spanWithDuration("openclaw.harness.run", spanAttrs, evt.durationMs, {
    parentContext: contextForTrustedDiagnosticSpanParent(evt, metadata),
    endTimeMs: evt.ts,
  });
  // A completed event can still describe a failed run; mark the span accordingly.
  if (evt.outcome === "error") {
    span.setStatus({
      code: SpanStatusCode.ERROR,
      message: "error",
    });
  }
  span.end(evt.ts);
};
|
||||||
|
|
||||||
|
/**
 * Exports telemetry for a `harness.run.error` event.
 *
 * The duration histogram is recorded with the shared harness attributes plus
 * the failing phase and the error category. When traces are enabled, an
 * `openclaw.harness.run` span with ERROR status is created and ended at
 * `evt.ts`; the status message is the same bounded error category value.
 *
 * @param evt - The harness error event.
 * @param metadata - Event metadata, forwarded to
 *   `contextForTrustedDiagnosticSpanParent` to resolve the parent context.
 */
const recordHarnessRunError = (
  evt: Extract<DiagnosticEventPayload, { type: "harness.run.error" }>,
  metadata: DiagnosticEventMetadata,
) => {
  const errorCategory = lowCardinalityAttr(evt.errorCategory, "other");
  const metricAttrs = {
    ...harnessRunMetricAttrs(evt),
    "openclaw.harness.phase": evt.phase,
    "openclaw.errorCategory": errorCategory,
  };
  harnessDurationHistogram.record(evt.durationMs, metricAttrs);

  if (tracesEnabled) {
    const spanAttrs: Record<string, string | number | boolean> = {
      ...metricAttrs,
      "error.type": errorCategory,
    };
    // Only flag cleanup failures; the attribute is absent otherwise.
    if (evt.cleanupFailed) {
      spanAttrs["openclaw.harness.cleanup_failed"] = true;
    }
    const parentContext = contextForTrustedDiagnosticSpanParent(evt, metadata);
    const span = spanWithDuration("openclaw.harness.run", spanAttrs, evt.durationMs, {
      parentContext,
      endTimeMs: evt.ts,
    });
    span.setStatus({ code: SpanStatusCode.ERROR, message: errorCategory });
    span.end(evt.ts);
  }
};
|
||||||
|
|
||||||
const recordContextAssembled = (
|
const recordContextAssembled = (
|
||||||
evt: Extract<DiagnosticEventPayload, { type: "context.assembled" }>,
|
evt: Extract<DiagnosticEventPayload, { type: "context.assembled" }>,
|
||||||
metadata: DiagnosticEventMetadata,
|
metadata: DiagnosticEventMetadata,
|
||||||
@@ -1746,6 +1830,12 @@ export function createDiagnosticsOtelService(): OpenClawPluginService {
|
|||||||
case "run.completed":
|
case "run.completed":
|
||||||
recordRunCompleted(evt, metadata);
|
recordRunCompleted(evt, metadata);
|
||||||
return;
|
return;
|
||||||
|
case "harness.run.completed":
|
||||||
|
recordHarnessRunCompleted(evt, metadata);
|
||||||
|
return;
|
||||||
|
case "harness.run.error":
|
||||||
|
recordHarnessRunError(evt, metadata);
|
||||||
|
return;
|
||||||
case "context.assembled":
|
case "context.assembled":
|
||||||
recordContextAssembled(evt, metadata);
|
recordContextAssembled(evt, metadata);
|
||||||
return;
|
return;
|
||||||
@@ -1781,6 +1871,7 @@ export function createDiagnosticsOtelService(): OpenClawPluginService {
|
|||||||
return;
|
return;
|
||||||
case "tool.execution.started":
|
case "tool.execution.started":
|
||||||
case "run.started":
|
case "run.started":
|
||||||
|
case "harness.run.started":
|
||||||
case "model.call.started":
|
case "model.call.started":
|
||||||
case "payload.large":
|
case "payload.large":
|
||||||
return;
|
return;
|
||||||
|
|||||||
@@ -13,6 +13,7 @@ objective: Verify a QA-lab gateway run emits bounded OpenTelemetry trace spans t
|
|||||||
successCriteria:
|
successCriteria:
|
||||||
- The diagnostics-otel plugin starts with trace export enabled.
|
- The diagnostics-otel plugin starts with trace export enabled.
|
||||||
- A minimal QA-channel agent turn completes.
|
- A minimal QA-channel agent turn completes.
|
||||||
|
- The trace includes the selected agent harness lifecycle span.
|
||||||
- The run emits low-cardinality OpenTelemetry trace spans without content or raw diagnostic identifiers.
|
- The run emits low-cardinality OpenTelemetry trace spans without content or raw diagnostic identifiers.
|
||||||
plugins:
|
plugins:
|
||||||
- diagnostics-otel
|
- diagnostics-otel
|
||||||
@@ -33,6 +34,7 @@ docsRefs:
|
|||||||
- docs/concepts/qa-e2e-automation.md
|
- docs/concepts/qa-e2e-automation.md
|
||||||
codeRefs:
|
codeRefs:
|
||||||
- extensions/diagnostics-otel/src/service.ts
|
- extensions/diagnostics-otel/src/service.ts
|
||||||
|
- src/agents/harness/v2.ts
|
||||||
- extensions/qa-lab/src/suite.ts
|
- extensions/qa-lab/src/suite.ts
|
||||||
execution:
|
execution:
|
||||||
kind: flow
|
kind: flow
|
||||||
|
|||||||
@@ -80,6 +80,7 @@ type CapturedSpan = {
|
|||||||
const DEFAULT_SCENARIO_ID = "otel-trace-smoke";
|
const DEFAULT_SCENARIO_ID = "otel-trace-smoke";
|
||||||
const REQUIRED_SPAN_NAMES = [
|
const REQUIRED_SPAN_NAMES = [
|
||||||
"openclaw.run",
|
"openclaw.run",
|
||||||
|
"openclaw.harness.run",
|
||||||
"openclaw.model.call",
|
"openclaw.model.call",
|
||||||
"openclaw.context.assembled",
|
"openclaw.context.assembled",
|
||||||
"openclaw.message.delivery",
|
"openclaw.message.delivery",
|
||||||
|
|||||||
@@ -1,5 +1,11 @@
|
|||||||
import type { Api, Model } from "@mariozechner/pi-ai";
|
import type { Api, Model } from "@mariozechner/pi-ai";
|
||||||
import { describe, expect, it, vi } from "vitest";
|
import { afterEach, describe, expect, it, vi } from "vitest";
|
||||||
|
import {
|
||||||
|
onInternalDiagnosticEvent,
|
||||||
|
resetDiagnosticEventsForTest,
|
||||||
|
type DiagnosticEventMetadata,
|
||||||
|
type DiagnosticEventPayload,
|
||||||
|
} from "../../infra/diagnostic-events.js";
|
||||||
import type { EmbeddedRunAttemptResult } from "../pi-embedded-runner/run/types.js";
|
import type { EmbeddedRunAttemptResult } from "../pi-embedded-runner/run/types.js";
|
||||||
import type { AgentHarness, AgentHarnessAttemptParams } from "./types.js";
|
import type { AgentHarness, AgentHarnessAttemptParams } from "./types.js";
|
||||||
import type { AgentHarnessV2 } from "./v2.js";
|
import type { AgentHarnessV2 } from "./v2.js";
|
||||||
@@ -9,6 +15,7 @@ function createAttemptParams(): AgentHarnessAttemptParams {
|
|||||||
return {
|
return {
|
||||||
prompt: "hello",
|
prompt: "hello",
|
||||||
sessionId: "session-1",
|
sessionId: "session-1",
|
||||||
|
sessionKey: "session-key",
|
||||||
runId: "run-1",
|
runId: "run-1",
|
||||||
sessionFile: "/tmp/session.jsonl",
|
sessionFile: "/tmp/session.jsonl",
|
||||||
workspaceDir: "/tmp/workspace",
|
workspaceDir: "/tmp/workspace",
|
||||||
@@ -19,9 +26,19 @@ function createAttemptParams(): AgentHarnessAttemptParams {
|
|||||||
authStorage: {} as never,
|
authStorage: {} as never,
|
||||||
modelRegistry: {} as never,
|
modelRegistry: {} as never,
|
||||||
thinkLevel: "low",
|
thinkLevel: "low",
|
||||||
|
messageChannel: "qa",
|
||||||
|
trigger: "manual",
|
||||||
} as AgentHarnessAttemptParams;
|
} as AgentHarnessAttemptParams;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/** Returns a fixed trace context fixture (trace id, span id, trace flags) for tests. */
function createDiagnosticTrace() {
  const traceId = "11111111111111111111111111111111";
  const spanId = "2222222222222222";
  const traceFlags = "01";
  return { traceId, spanId, traceFlags };
}
|
||||||
|
|
||||||
function createAttemptResult(): EmbeddedRunAttemptResult {
|
function createAttemptResult(): EmbeddedRunAttemptResult {
|
||||||
return {
|
return {
|
||||||
aborted: false,
|
aborted: false,
|
||||||
@@ -32,6 +49,7 @@ function createAttemptResult(): EmbeddedRunAttemptResult {
|
|||||||
promptError: null,
|
promptError: null,
|
||||||
promptErrorSource: null,
|
promptErrorSource: null,
|
||||||
sessionIdUsed: "session-1",
|
sessionIdUsed: "session-1",
|
||||||
|
diagnosticTrace: createDiagnosticTrace(),
|
||||||
messagesSnapshot: [],
|
messagesSnapshot: [],
|
||||||
assistantTexts: ["ok"],
|
assistantTexts: ["ok"],
|
||||||
toolMetas: [],
|
toolMetas: [],
|
||||||
@@ -46,7 +64,28 @@ function createAttemptResult(): EmbeddedRunAttemptResult {
|
|||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Resolves after one `setImmediate` turn.
 *
 * NOTE(review): presumably used to let queued diagnostic listeners run before
 * assertions; confirm against the diagnostic event dispatcher.
 */
async function flushDiagnosticEvents(): Promise<void> {
  const nextImmediate = new Promise<void>((done) => {
    setImmediate(done);
  });
  await nextImmediate;
}
|
||||||
|
|
||||||
|
/**
 * Subscribes to internal diagnostic events and collects those whose type
 * starts with `harness.run.`, together with their metadata.
 *
 * @returns The live array of captured events and the unsubscribe function
 *   returned by `onInternalDiagnosticEvent`.
 */
function captureDiagnosticEvents(): {
  events: Array<{ event: DiagnosticEventPayload; metadata: DiagnosticEventMetadata }>;
  unsubscribe: () => void;
} {
  const captured: Array<{ event: DiagnosticEventPayload; metadata: DiagnosticEventMetadata }> = [];
  const stopListening = onInternalDiagnosticEvent((event, metadata) => {
    // Ignore everything outside the harness lifecycle namespace.
    if (!event.type.startsWith("harness.run.")) {
      return;
    }
    captured.push({ event, metadata });
  });
  return { events: captured, unsubscribe: stopListening };
}
|
||||||
|
|
||||||
describe("AgentHarness V2 compatibility adapter", () => {
|
describe("AgentHarness V2 compatibility adapter", () => {
|
||||||
|
afterEach(() => {
|
||||||
|
resetDiagnosticEventsForTest();
|
||||||
|
});
|
||||||
|
|
||||||
it("executes prepare/start/send/outcome/cleanup as one bounded lifecycle", async () => {
|
it("executes prepare/start/send/outcome/cleanup as one bounded lifecycle", async () => {
|
||||||
const params = createAttemptParams();
|
const params = createAttemptParams();
|
||||||
const result = createAttemptResult();
|
const result = createAttemptResult();
|
||||||
@@ -102,6 +141,112 @@ describe("AgentHarness V2 compatibility adapter", () => {
|
|||||||
]);
|
]);
|
||||||
});
|
});
|
||||||
|
|
||||||
|
it("emits trusted harness lifecycle diagnostics for successful attempts", async () => {
|
||||||
|
resetDiagnosticEventsForTest();
|
||||||
|
const params = createAttemptParams();
|
||||||
|
const result = {
|
||||||
|
...createAttemptResult(),
|
||||||
|
agentHarnessResultClassification: "reasoning-only",
|
||||||
|
yieldDetected: true,
|
||||||
|
itemLifecycle: { startedCount: 3, completedCount: 2, activeCount: 1 },
|
||||||
|
} as EmbeddedRunAttemptResult;
|
||||||
|
const harness: AgentHarnessV2 = {
|
||||||
|
id: "codex",
|
||||||
|
label: "Codex",
|
||||||
|
pluginId: "codex-plugin",
|
||||||
|
supports: () => ({ supported: true }),
|
||||||
|
prepare: async () => ({
|
||||||
|
harnessId: "codex",
|
||||||
|
label: "Codex",
|
||||||
|
pluginId: "codex-plugin",
|
||||||
|
params,
|
||||||
|
lifecycleState: "prepared",
|
||||||
|
}),
|
||||||
|
start: async (prepared) => ({ ...prepared, lifecycleState: "started" }),
|
||||||
|
send: async () => result,
|
||||||
|
resolveOutcome: async (_session, rawResult) => rawResult,
|
||||||
|
cleanup: async () => {},
|
||||||
|
};
|
||||||
|
const diagnostics = captureDiagnosticEvents();
|
||||||
|
try {
|
||||||
|
await runAgentHarnessV2LifecycleAttempt(harness, params);
|
||||||
|
await flushDiagnosticEvents();
|
||||||
|
} finally {
|
||||||
|
diagnostics.unsubscribe();
|
||||||
|
}
|
||||||
|
|
||||||
|
expect(diagnostics.events.map(({ event }) => event.type)).toEqual([
|
||||||
|
"harness.run.started",
|
||||||
|
"harness.run.completed",
|
||||||
|
]);
|
||||||
|
expect(diagnostics.events.every(({ metadata }) => metadata.trusted)).toBe(true);
|
||||||
|
expect(diagnostics.events[1]?.event).toMatchObject({
|
||||||
|
type: "harness.run.completed",
|
||||||
|
runId: "run-1",
|
||||||
|
sessionKey: "session-key",
|
||||||
|
sessionId: "session-1",
|
||||||
|
provider: "codex",
|
||||||
|
model: "gpt-5.4",
|
||||||
|
channel: "qa",
|
||||||
|
trigger: "manual",
|
||||||
|
harnessId: "codex",
|
||||||
|
pluginId: "codex-plugin",
|
||||||
|
outcome: "completed",
|
||||||
|
resultClassification: "reasoning-only",
|
||||||
|
yieldDetected: true,
|
||||||
|
itemLifecycle: { startedCount: 3, completedCount: 2, activeCount: 1 },
|
||||||
|
durationMs: expect.any(Number),
|
||||||
|
});
|
||||||
|
});
|
||||||
|
|
||||||
|
it("emits trusted harness error diagnostics with the failing lifecycle phase", async () => {
|
||||||
|
resetDiagnosticEventsForTest();
|
||||||
|
const params = createAttemptParams();
|
||||||
|
const sendError = new Error("codex app-server send failed");
|
||||||
|
const harness: AgentHarnessV2 = {
|
||||||
|
id: "codex",
|
||||||
|
label: "Codex",
|
||||||
|
supports: () => ({ supported: true }),
|
||||||
|
prepare: async () => ({
|
||||||
|
harnessId: "codex",
|
||||||
|
label: "Codex",
|
||||||
|
params,
|
||||||
|
lifecycleState: "prepared",
|
||||||
|
}),
|
||||||
|
start: async (prepared) => ({ ...prepared, lifecycleState: "started" }),
|
||||||
|
send: async () => {
|
||||||
|
throw sendError;
|
||||||
|
},
|
||||||
|
resolveOutcome: async (_session, rawResult) => rawResult,
|
||||||
|
cleanup: async () => {
|
||||||
|
throw new Error("cleanup failed");
|
||||||
|
},
|
||||||
|
};
|
||||||
|
const diagnostics = captureDiagnosticEvents();
|
||||||
|
try {
|
||||||
|
await expect(runAgentHarnessV2LifecycleAttempt(harness, params)).rejects.toThrow(
|
||||||
|
"codex app-server send failed",
|
||||||
|
);
|
||||||
|
await flushDiagnosticEvents();
|
||||||
|
} finally {
|
||||||
|
diagnostics.unsubscribe();
|
||||||
|
}
|
||||||
|
|
||||||
|
expect(diagnostics.events.map(({ event }) => event.type)).toEqual([
|
||||||
|
"harness.run.started",
|
||||||
|
"harness.run.error",
|
||||||
|
]);
|
||||||
|
expect(diagnostics.events.every(({ metadata }) => metadata.trusted)).toBe(true);
|
||||||
|
expect(diagnostics.events[1]?.event).toMatchObject({
|
||||||
|
type: "harness.run.error",
|
||||||
|
phase: "send",
|
||||||
|
errorCategory: "Error",
|
||||||
|
cleanupFailed: true,
|
||||||
|
harnessId: "codex",
|
||||||
|
durationMs: expect.any(Number),
|
||||||
|
});
|
||||||
|
});
|
||||||
|
|
||||||
it("runs cleanup with the original failure and preserves that failure", async () => {
|
it("runs cleanup with the original failure and preserves that failure", async () => {
|
||||||
const params = createAttemptParams();
|
const params = createAttemptParams();
|
||||||
const sendError = new Error("codex app-server send failed");
|
const sendError = new Error("codex app-server send failed");
|
||||||
|
|||||||
@@ -1,3 +1,10 @@
|
|||||||
|
import { diagnosticErrorCategory } from "../../infra/diagnostic-error-metadata.js";
|
||||||
|
import {
|
||||||
|
emitTrustedDiagnosticEvent,
|
||||||
|
type DiagnosticHarnessRunErrorEvent,
|
||||||
|
type DiagnosticHarnessRunOutcome,
|
||||||
|
} from "../../infra/diagnostic-events.js";
|
||||||
|
import type { DiagnosticTraceContext } from "../../infra/diagnostic-trace-context.js";
|
||||||
import { formatErrorMessage } from "../../infra/errors.js";
|
import { formatErrorMessage } from "../../infra/errors.js";
|
||||||
import { createSubsystemLogger } from "../../logging/subsystem.js";
|
import { createSubsystemLogger } from "../../logging/subsystem.js";
|
||||||
import { applyAgentHarnessResultClassification } from "./result-classification.js";
|
import { applyAgentHarnessResultClassification } from "./result-classification.js";
|
||||||
@@ -13,6 +20,7 @@ import type {
|
|||||||
} from "./types.js";
|
} from "./types.js";
|
||||||
|
|
||||||
const log = createSubsystemLogger("agents/harness/v2");
|
const log = createSubsystemLogger("agents/harness/v2");
|
||||||
|
type AgentHarnessV2LifecyclePhase = DiagnosticHarnessRunErrorEvent["phase"];
|
||||||
|
|
||||||
type AgentHarnessV2RunBase = {
|
type AgentHarnessV2RunBase = {
|
||||||
harnessId: string;
|
harnessId: string;
|
||||||
@@ -95,6 +103,87 @@ export function adaptAgentHarnessToV2(harness: AgentHarness): AgentHarnessV2 {
|
|||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Builds the fields shared by every harness lifecycle diagnostic event.
 *
 * Run id, session id, provider, model and harness id are always present.
 * Plugin id, session key, trigger, channel and trace are only included when
 * the corresponding input value is truthy.
 *
 * @param harness - Harness whose `id` and optional `pluginId` are reported.
 * @param params - Attempt parameters supplying run/session/provider/model data.
 * @param trace - Optional trace context to attach to the event.
 * @returns The common event fields, without a `type`.
 */
function agentHarnessDiagnosticBase(
  harness: AgentHarnessV2,
  params: AgentHarnessAttemptParams,
  trace?: DiagnosticTraceContext,
) {
  const { runId, sessionId, provider, modelId, sessionKey, trigger, messageChannel } = params;
  const pluginFields = harness.pluginId ? { pluginId: harness.pluginId } : {};
  const sessionKeyFields = sessionKey ? { sessionKey } : {};
  const triggerFields = trigger ? { trigger } : {};
  const channelFields = messageChannel ? { channel: messageChannel } : {};
  const traceFields = trace ? { trace } : {};
  return {
    runId,
    sessionId,
    provider,
    model: modelId,
    harnessId: harness.id,
    ...pluginFields,
    ...sessionKeyFields,
    ...triggerFields,
    ...channelFields,
    ...traceFields,
  };
}
|
||||||
|
|
||||||
|
/**
 * Maps an attempt result onto a single diagnostic outcome.
 *
 * Precedence is: prompt error, then abort (external or internal), then any of
 * the timeout flags, otherwise `"completed"`.
 *
 * NOTE(review): a result that has both an abort flag and a timeout flag set is
 * reported as `"aborted"`; confirm this precedence is intended if timeouts
 * also mark the attempt as aborted.
 *
 * @param result - The resolved attempt result.
 * @returns The outcome label for the harness run.
 */
function agentHarnessRunOutcome(result: AgentHarnessAttemptResult): DiagnosticHarnessRunOutcome {
  if (result.promptError) {
    return "error";
  }
  const wasAborted = Boolean(result.externalAbort || result.aborted);
  const hitTimeout = Boolean(
    result.timedOut || result.idleTimedOut || result.timedOutDuringCompaction,
  );
  if (wasAborted) {
    return "aborted";
  }
  return hitTimeout ? "timed_out" : "completed";
}
|
||||||
|
|
||||||
|
/**
 * Emits the trusted `harness.run.started` diagnostic event for an attempt.
 *
 * @param harness - Harness about to run the attempt.
 * @param params - Attempt parameters used to populate the shared event fields.
 */
function emitAgentHarnessRunStarted(
  harness: AgentHarnessV2,
  params: AgentHarnessAttemptParams,
): void {
  // No trace is passed here, so the started event carries no trace context.
  const baseFields = agentHarnessDiagnosticBase(harness, params);
  emitTrustedDiagnosticEvent({ type: "harness.run.started", ...baseFields });
}
|
||||||
|
|
||||||
|
/**
 * Emits the trusted `harness.run.completed` diagnostic event for an attempt
 * that ran through resolve and cleanup.
 *
 * The event carries the shared harness fields (with the result's diagnostic
 * trace), the wall-clock duration since `startedAt`, the derived outcome, and
 * the optional classification, yield flag and item lifecycle counters when the
 * result provides them.
 *
 * @param params.harness - Harness that produced the result.
 * @param params.attemptParams - Attempt parameters for the shared event fields.
 * @param params.result - Resolved attempt result.
 * @param params.startedAt - `Date.now()` timestamp taken when the attempt began.
 */
function emitAgentHarnessRunCompleted(params: {
  harness: AgentHarnessV2;
  attemptParams: AgentHarnessAttemptParams;
  result: AgentHarnessAttemptResult;
  startedAt: number;
}): void {
  const { harness, attemptParams, result, startedAt } = params;
  emitTrustedDiagnosticEvent({
    type: "harness.run.completed",
    ...agentHarnessDiagnosticBase(harness, attemptParams, result.diagnosticTrace),
    durationMs: Date.now() - startedAt,
    outcome: agentHarnessRunOutcome(result),
    ...(result.agentHarnessResultClassification
      ? { resultClassification: result.agentHarnessResultClassification }
      : {}),
    ...(typeof result.yieldDetected === "boolean" ? { yieldDetected: result.yieldDetected } : {}),
    // Only attach lifecycle counters when the result actually has them. An
    // unconditional spread would emit an empty object that consumers treat as
    // "present" and then read undefined counters from.
    ...(result.itemLifecycle ? { itemLifecycle: { ...result.itemLifecycle } } : {}),
  });
}
|
||||||
|
|
||||||
|
/**
 * Emits the trusted `harness.run.error` diagnostic event for a failed attempt.
 *
 * The event carries the shared harness fields, the wall-clock duration since
 * `startedAt`, the lifecycle phase that was active, and the category derived
 * from the error by `diagnosticErrorCategory`. `cleanupFailed` is only
 * included when it is truthy.
 *
 * @param params.harness - Harness that failed.
 * @param params.attemptParams - Attempt parameters for the shared event fields.
 * @param params.startedAt - `Date.now()` timestamp taken when the attempt began.
 * @param params.phase - Lifecycle phase in which the failure occurred.
 * @param params.error - The thrown value.
 * @param params.cleanupFailed - Whether cleanup after the failure also threw.
 */
function emitAgentHarnessRunError(params: {
  harness: AgentHarnessV2;
  attemptParams: AgentHarnessAttemptParams;
  startedAt: number;
  phase: AgentHarnessV2LifecyclePhase;
  error: unknown;
  cleanupFailed?: boolean;
}): void {
  const cleanupFields = params.cleanupFailed ? { cleanupFailed: true } : {};
  emitTrustedDiagnosticEvent({
    type: "harness.run.error",
    ...agentHarnessDiagnosticBase(params.harness, params.attemptParams),
    durationMs: Date.now() - params.startedAt,
    phase: params.phase,
    errorCategory: diagnosticErrorCategory(params.error),
    ...cleanupFields,
  });
}
|
||||||
|
|
||||||
export async function runAgentHarnessV2LifecycleAttempt(
|
export async function runAgentHarnessV2LifecycleAttempt(
|
||||||
harness: AgentHarnessV2,
|
harness: AgentHarnessV2,
|
||||||
params: AgentHarnessAttemptParams,
|
params: AgentHarnessAttemptParams,
|
||||||
@@ -103,13 +192,21 @@ export async function runAgentHarnessV2LifecycleAttempt(
|
|||||||
let session: AgentHarnessV2Session | undefined;
|
let session: AgentHarnessV2Session | undefined;
|
||||||
let rawResult: AgentHarnessAttemptResult | undefined;
|
let rawResult: AgentHarnessAttemptResult | undefined;
|
||||||
let result: AgentHarnessAttemptResult;
|
let result: AgentHarnessAttemptResult;
|
||||||
|
let phase: AgentHarnessV2LifecyclePhase = "prepare";
|
||||||
|
const startedAt = Date.now();
|
||||||
|
|
||||||
|
emitAgentHarnessRunStarted(harness, params);
|
||||||
try {
|
try {
|
||||||
|
phase = "prepare";
|
||||||
prepared = await harness.prepare(params);
|
prepared = await harness.prepare(params);
|
||||||
|
phase = "start";
|
||||||
session = await harness.start(prepared);
|
session = await harness.start(prepared);
|
||||||
|
phase = "send";
|
||||||
rawResult = await harness.send(session);
|
rawResult = await harness.send(session);
|
||||||
|
phase = "resolve";
|
||||||
result = await harness.resolveOutcome(session, rawResult);
|
result = await harness.resolveOutcome(session, rawResult);
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
|
let cleanupFailed = false;
|
||||||
try {
|
try {
|
||||||
await harness.cleanup({
|
await harness.cleanup({
|
||||||
prepared,
|
prepared,
|
||||||
@@ -118,6 +215,7 @@ export async function runAgentHarnessV2LifecycleAttempt(
|
|||||||
...(rawResult === undefined ? {} : { result: rawResult }),
|
...(rawResult === undefined ? {} : { result: rawResult }),
|
||||||
});
|
});
|
||||||
} catch (cleanupError) {
|
} catch (cleanupError) {
|
||||||
|
cleanupFailed = true;
|
||||||
// Preserve the user-visible harness failure. Cleanup errors after a
|
// Preserve the user-visible harness failure. Cleanup errors after a
|
||||||
// failed lifecycle stage must not mask the actionable runtime error.
|
// failed lifecycle stage must not mask the actionable runtime error.
|
||||||
log.warn("agent harness cleanup failed after attempt failure", {
|
log.warn("agent harness cleanup failed after attempt failure", {
|
||||||
@@ -128,9 +226,30 @@ export async function runAgentHarnessV2LifecycleAttempt(
|
|||||||
originalError: formatErrorMessage(error),
|
originalError: formatErrorMessage(error),
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
emitAgentHarnessRunError({
|
||||||
|
harness,
|
||||||
|
attemptParams: params,
|
||||||
|
startedAt,
|
||||||
|
phase,
|
||||||
|
error,
|
||||||
|
cleanupFailed,
|
||||||
|
});
|
||||||
throw error;
|
throw error;
|
||||||
}
|
}
|
||||||
|
|
||||||
await harness.cleanup({ prepared, session, result });
|
try {
|
||||||
|
phase = "cleanup";
|
||||||
|
await harness.cleanup({ prepared, session, result });
|
||||||
|
} catch (error) {
|
||||||
|
emitAgentHarnessRunError({
|
||||||
|
harness,
|
||||||
|
attemptParams: params,
|
||||||
|
startedAt,
|
||||||
|
phase,
|
||||||
|
error,
|
||||||
|
});
|
||||||
|
throw error;
|
||||||
|
}
|
||||||
|
emitAgentHarnessRunCompleted({ harness, attemptParams: params, result, startedAt });
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -256,6 +256,47 @@ export type DiagnosticRunCompletedEvent = DiagnosticRunBaseEvent & {
|
|||||||
errorCategory?: string;
|
errorCategory?: string;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
export type DiagnosticHarnessRunPhase = "prepare" | "start" | "send" | "resolve" | "cleanup";
|
||||||
|
export type DiagnosticHarnessRunOutcome = "completed" | "aborted" | "timed_out" | "error";
|
||||||
|
|
||||||
|
type DiagnosticHarnessRunBaseEvent = DiagnosticBaseEvent & {
|
||||||
|
type: "harness.run.started" | "harness.run.completed" | "harness.run.error";
|
||||||
|
runId: string;
|
||||||
|
sessionKey?: string;
|
||||||
|
sessionId?: string;
|
||||||
|
provider?: string;
|
||||||
|
model?: string;
|
||||||
|
trigger?: string;
|
||||||
|
channel?: string;
|
||||||
|
harnessId: string;
|
||||||
|
pluginId?: string;
|
||||||
|
};
|
||||||
|
|
||||||
|
export type DiagnosticHarnessRunStartedEvent = DiagnosticHarnessRunBaseEvent & {
|
||||||
|
type: "harness.run.started";
|
||||||
|
};
|
||||||
|
|
||||||
|
export type DiagnosticHarnessRunCompletedEvent = DiagnosticHarnessRunBaseEvent & {
|
||||||
|
type: "harness.run.completed";
|
||||||
|
durationMs: number;
|
||||||
|
outcome: DiagnosticHarnessRunOutcome;
|
||||||
|
resultClassification?: "empty" | "reasoning-only" | "planning-only";
|
||||||
|
yieldDetected?: boolean;
|
||||||
|
itemLifecycle?: {
|
||||||
|
startedCount: number;
|
||||||
|
completedCount: number;
|
||||||
|
activeCount: number;
|
||||||
|
};
|
||||||
|
};
|
||||||
|
|
||||||
|
export type DiagnosticHarnessRunErrorEvent = DiagnosticHarnessRunBaseEvent & {
|
||||||
|
type: "harness.run.error";
|
||||||
|
durationMs: number;
|
||||||
|
phase: DiagnosticHarnessRunPhase;
|
||||||
|
errorCategory: string;
|
||||||
|
cleanupFailed?: boolean;
|
||||||
|
};
|
||||||
|
|
||||||
type DiagnosticModelCallBaseEvent = DiagnosticBaseEvent & {
|
type DiagnosticModelCallBaseEvent = DiagnosticBaseEvent & {
|
||||||
type: "model.call.started" | "model.call.completed" | "model.call.error";
|
type: "model.call.started" | "model.call.completed" | "model.call.error";
|
||||||
runId: string;
|
runId: string;
|
||||||
@@ -392,6 +433,9 @@ export type DiagnosticEventPayload =
|
|||||||
| DiagnosticExecProcessCompletedEvent
|
| DiagnosticExecProcessCompletedEvent
|
||||||
| DiagnosticRunStartedEvent
|
| DiagnosticRunStartedEvent
|
||||||
| DiagnosticRunCompletedEvent
|
| DiagnosticRunCompletedEvent
|
||||||
|
| DiagnosticHarnessRunStartedEvent
|
||||||
|
| DiagnosticHarnessRunCompletedEvent
|
||||||
|
| DiagnosticHarnessRunErrorEvent
|
||||||
| DiagnosticModelCallStartedEvent
|
| DiagnosticModelCallStartedEvent
|
||||||
| DiagnosticModelCallCompletedEvent
|
| DiagnosticModelCallCompletedEvent
|
||||||
| DiagnosticModelCallErrorEvent
|
| DiagnosticModelCallErrorEvent
|
||||||
@@ -446,6 +490,9 @@ const ASYNC_DIAGNOSTIC_EVENT_TYPES = new Set<DiagnosticEventPayload["type"]>([
|
|||||||
"model.call.started",
|
"model.call.started",
|
||||||
"model.call.completed",
|
"model.call.completed",
|
||||||
"model.call.error",
|
"model.call.error",
|
||||||
|
"harness.run.started",
|
||||||
|
"harness.run.completed",
|
||||||
|
"harness.run.error",
|
||||||
"context.assembled",
|
"context.assembled",
|
||||||
"log.record",
|
"log.record",
|
||||||
]);
|
]);
|
||||||
|
|||||||
@@ -305,6 +305,34 @@ function sanitizeDiagnosticEvent(event: DiagnosticEventPayload): DiagnosticStabi
|
|||||||
record.outcome = event.outcome;
|
record.outcome = event.outcome;
|
||||||
assignReasonCode(record, event.errorCategory);
|
assignReasonCode(record, event.errorCategory);
|
||||||
break;
|
break;
|
||||||
|
case "harness.run.started":
|
||||||
|
record.source = event.harnessId;
|
||||||
|
record.pluginId = event.pluginId;
|
||||||
|
record.provider = event.provider;
|
||||||
|
record.model = event.model;
|
||||||
|
record.channel = event.channel;
|
||||||
|
break;
|
||||||
|
case "harness.run.completed":
|
||||||
|
record.source = event.harnessId;
|
||||||
|
record.pluginId = event.pluginId;
|
||||||
|
record.provider = event.provider;
|
||||||
|
record.model = event.model;
|
||||||
|
record.channel = event.channel;
|
||||||
|
record.durationMs = event.durationMs;
|
||||||
|
record.outcome = event.outcome;
|
||||||
|
record.count = event.itemLifecycle?.completedCount;
|
||||||
|
break;
|
||||||
|
case "harness.run.error":
|
||||||
|
record.source = event.harnessId;
|
||||||
|
record.pluginId = event.pluginId;
|
||||||
|
record.provider = event.provider;
|
||||||
|
record.model = event.model;
|
||||||
|
record.channel = event.channel;
|
||||||
|
record.durationMs = event.durationMs;
|
||||||
|
record.outcome = "error";
|
||||||
|
record.action = event.phase;
|
||||||
|
assignReasonCode(record, event.errorCategory);
|
||||||
|
break;
|
||||||
case "model.call.started":
|
case "model.call.started":
|
||||||
record.provider = event.provider;
|
record.provider = event.provider;
|
||||||
record.model = event.model;
|
record.model = event.model;
|
||||||
|
|||||||
Reference in New Issue
Block a user