benchmarks: add agentic parity report gate

Eva
2026-04-11 01:51:16 +07:00
committed by Peter Steinberger
parent 79f539d9ce
commit 67fdd3b4df
6 changed files with 444 additions and 0 deletions

View File

@@ -108,6 +108,12 @@ PR D is the proof layer. It should not be the reason runtime-correctness PRs are
- reports are readable by humans and automation
- parity claims are evidence-backed, not anecdotal
Expected artifacts from PR D:
- `qa-suite-report.md` / `qa-suite-summary.json` for each model run (input shape sketched after this list)
- `qa-agentic-parity-report.md` with aggregate and scenario-level comparison
- `qa-agentic-parity-summary.json` with a machine-readable verdict
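For orientation, here is a minimal sketch of the per-model `qa-suite-summary.json` content the parity gate parses, expressed as the `QaParitySuiteSummary` shape; the scenario names, details, and import path are illustrative and assume code sitting next to `agentic-parity-report.ts` in the QA-lab package.

```ts
import type { QaParitySuiteSummary } from "./agentic-parity-report.js";

// Illustrative per-model suite summary; scenario names and details are hypothetical.
export const exampleSuiteSummary: QaParitySuiteSummary = {
  scenarios: [
    { name: "Scenario A", status: "pass" },
    { name: "Scenario B", status: "fail", details: "incomplete turn detected" },
  ],
  counts: { total: 2, passed: 1, failed: 1 },
};
```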
## Release gate
Do not claim GPT-5.4 parity or superiority over Opus 4.6 until:

View File

@@ -48,6 +48,22 @@ This slice adds the first-wave QA-lab parity pack so GPT-5.4 and Opus 4.6 can be
The parity pack is the proof layer. It does not change runtime behavior by itself.
After you have two `qa-suite-summary.json` artifacts, generate the release-gate comparison with:
```bash
pnpm qa parity-report \
--repo-root . \
--candidate-summary .artifacts/qa-e2e/gpt54/qa-suite-summary.json \
--baseline-summary .artifacts/qa-e2e/opus46/qa-suite-summary.json \
--output-dir .artifacts/qa-e2e/parity
```
That command writes:
- a human-readable Markdown report
- a machine-readable JSON verdict (consumption sketch after this list)
- an explicit `pass` / `fail` gate result
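As a sketch of how automation might consume that verdict (assuming the `--output-dir` from the command above and an ESM script with top-level await; this gating script is not part of the commit itself):

```ts
import { readFile } from "node:fs/promises";

// Read the machine-readable verdict written by `pnpm qa parity-report`.
const raw = await readFile(
  ".artifacts/qa-e2e/parity/qa-agentic-parity-summary.json",
  "utf8",
);
// The file is the serialized comparison, so `pass` and `failures` are available directly.
const verdict = JSON.parse(raw) as { pass: boolean; failures: string[] };
if (!verdict.pass) {
  console.error(verdict.failures.join("\n"));
  process.exit(1);
}
```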
## Why this improves GPT-5.4 in practice
Before this work, GPT-5.4 on OpenClaw could feel less agentic than Opus in real coding sessions because the runtime tolerated behaviors that are especially harmful for GPT-5-style models:
@@ -116,6 +132,13 @@ Required outcomes:
- no silent replay or compaction abandonment
- parity-pack metrics that are at least as strong as the agreed Opus 4.6 baseline
For the first-wave harness, the gate compares (see the sketch after this list):
- completion rate
- unintended-stop rate
- valid-tool-call rate
- fake-success count
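A minimal programmatic sketch of that gate using the exported `buildQaAgenticParityComparison` helper; the summaries and labels are illustrative, and the import path assumes the QA-lab package.

```ts
import {
  buildQaAgenticParityComparison,
  type QaParitySuiteSummary,
} from "./agentic-parity-report.js";

// One passing baseline scenario versus a candidate that timed out: the completion-rate,
// unintended-stop, and valid-tool-call checks all trip, so the gate fails.
const baselineSummary: QaParitySuiteSummary = {
  scenarios: [{ name: "Scenario A", status: "pass" }],
};
const candidateSummary: QaParitySuiteSummary = {
  scenarios: [
    { name: "Scenario A", status: "fail", details: "timed out before it continued" },
  ],
};
const comparison = buildQaAgenticParityComparison({
  candidateLabel: "openai/gpt-5.4",
  baselineLabel: "anthropic/claude-opus-4-6",
  candidateSummary,
  baselineSummary,
});
console.log(comparison.pass); // false
console.log(comparison.failures); // three human-readable gate failures
```

In CLI use, the same verdict object lands in `qa-agentic-parity-summary.json`, and the final `QA parity verdict: pass|fail` line echoes its `pass` field.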
## Who should enable `strict-agentic`
Use `strict-agentic` when:

View File

@@ -0,0 +1,79 @@
import { describe, expect, it } from "vitest";
import {
buildQaAgenticParityComparison,
computeQaAgenticParityMetrics,
renderQaAgenticParityMarkdownReport,
type QaParitySuiteSummary,
} from "./agentic-parity-report.js";
describe("qa agentic parity report", () => {
it("computes first-wave parity metrics from suite summaries", () => {
const summary: QaParitySuiteSummary = {
scenarios: [
{ name: "Scenario A", status: "pass" },
{ name: "Scenario B", status: "fail", details: "incomplete turn detected" },
],
};
expect(computeQaAgenticParityMetrics(summary)).toEqual({
totalScenarios: 2,
passedScenarios: 1,
failedScenarios: 1,
completionRate: 0.5,
unintendedStopCount: 1,
unintendedStopRate: 0.5,
validToolCallCount: 1,
validToolCallRate: 0.5,
fakeSuccessCount: 0,
});
});
it("fails the parity gate when the candidate regresses against baseline", () => {
const comparison = buildQaAgenticParityComparison({
candidateLabel: "openai/gpt-5.4",
baselineLabel: "anthropic/claude-opus-4-6",
candidateSummary: {
scenarios: [
{ name: "Scenario A", status: "pass" },
{ name: "Scenario B", status: "fail", details: "timed out before it continued" },
],
},
baselineSummary: {
scenarios: [
{ name: "Scenario A", status: "pass" },
{ name: "Scenario B", status: "pass" },
],
},
comparedAt: "2026-04-11T00:00:00.000Z",
});
expect(comparison.pass).toBe(false);
expect(comparison.failures).toContain(
"openai/gpt-5.4 completion rate 50.0% is below anthropic/claude-opus-4-6 100.0%.",
);
expect(comparison.failures).toContain(
"openai/gpt-5.4 unintended-stop rate 50.0% exceeds anthropic/claude-opus-4-6 0.0%.",
);
});
it("renders a readable markdown parity report", () => {
const comparison = buildQaAgenticParityComparison({
candidateLabel: "openai/gpt-5.4",
baselineLabel: "anthropic/claude-opus-4-6",
candidateSummary: {
scenarios: [{ name: "Scenario A", status: "pass" }],
},
baselineSummary: {
scenarios: [{ name: "Scenario A", status: "pass" }],
},
comparedAt: "2026-04-11T00:00:00.000Z",
});
const report = renderQaAgenticParityMarkdownReport(comparison);
expect(report).toContain("# OpenClaw GPT-5.4 / Opus 4.6 Agentic Parity Report");
expect(report).toContain("| Completion rate | 100.0% | 100.0% |");
expect(report).toContain("### Scenario A");
expect(report).toContain("- Verdict: pass");
});
});

View File

@@ -0,0 +1,259 @@
export type QaParityReportStep = {
name: string;
status: "pass" | "fail" | "skip";
details?: string;
};
export type QaParityReportScenario = {
name: string;
status: "pass" | "fail" | "skip";
details?: string;
steps?: QaParityReportStep[];
};
export type QaParitySuiteSummary = {
scenarios: QaParityReportScenario[];
counts?: {
total?: number;
passed?: number;
failed?: number;
};
};
export type QaAgenticParityMetrics = {
totalScenarios: number;
passedScenarios: number;
failedScenarios: number;
completionRate: number;
unintendedStopCount: number;
unintendedStopRate: number;
validToolCallCount: number;
validToolCallRate: number;
fakeSuccessCount: number;
};
export type QaAgenticParityScenarioComparison = {
name: string;
candidateStatus: "pass" | "fail" | "skip" | "missing";
baselineStatus: "pass" | "fail" | "skip" | "missing";
candidateDetails?: string;
baselineDetails?: string;
};
export type QaAgenticParityComparison = {
candidateLabel: string;
baselineLabel: string;
comparedAt: string;
candidateMetrics: QaAgenticParityMetrics;
baselineMetrics: QaAgenticParityMetrics;
scenarioComparisons: QaAgenticParityScenarioComparison[];
pass: boolean;
failures: string[];
notes: string[];
};
const UNINTENDED_STOP_PATTERNS = [
/incomplete turn/i,
/\btimed out\b/i,
/\btimeout\b/i,
/\bstopped\b/i,
/\bblocked\b/i,
/\babandoned\b/i,
/did not continue/i,
] as const;
const SUSPICIOUS_PASS_PATTERNS = [
/incomplete turn/i,
/\btimed out\b/i,
/\btimeout\b/i,
/\bblocked\b/i,
/\berror\b/i,
/\bfailed\b/i,
] as const;
function normalizeScenarioStatus(status: string | undefined): "pass" | "fail" | "skip" {
return status === "pass" || status === "fail" || status === "skip" ? status : "fail";
}
function scenarioText(scenario: QaParityReportScenario) {
const parts = [scenario.details ?? ""];
for (const step of scenario.steps ?? []) {
parts.push(step.details ?? "");
}
return parts.filter(Boolean).join("\n");
}
function scenarioHasPattern(
scenario: QaParityReportScenario,
patterns: readonly RegExp[],
): boolean {
const text = scenarioText(scenario);
return text.length > 0 && patterns.some((pattern) => pattern.test(text));
}
export function computeQaAgenticParityMetrics(
summary: QaParitySuiteSummary,
): QaAgenticParityMetrics {
const scenarios = summary.scenarios.map((scenario) => ({
...scenario,
status: normalizeScenarioStatus(scenario.status),
}));
const totalScenarios = summary.counts?.total ?? scenarios.length;
const passedScenarios =
summary.counts?.passed ?? scenarios.filter((scenario) => scenario.status === "pass").length;
const failedScenarios =
summary.counts?.failed ?? scenarios.filter((scenario) => scenario.status === "fail").length;
const unintendedStopCount = scenarios.filter(
(scenario) =>
scenario.status !== "pass" && scenarioHasPattern(scenario, UNINTENDED_STOP_PATTERNS),
).length;
const fakeSuccessCount = scenarios.filter(
(scenario) =>
scenario.status === "pass" && scenarioHasPattern(scenario, SUSPICIOUS_PASS_PATTERNS),
).length;
// First-wave parity scenarios are all tool-mediated tasks, so a passing scenario is our
// verified unit of valid tool-backed execution in this harness.
const validToolCallCount = passedScenarios;
const rate = (value: number) => (totalScenarios > 0 ? value / totalScenarios : 0);
return {
totalScenarios,
passedScenarios,
failedScenarios,
completionRate: rate(passedScenarios),
unintendedStopCount,
unintendedStopRate: rate(unintendedStopCount),
validToolCallCount,
validToolCallRate: rate(validToolCallCount),
fakeSuccessCount,
};
}
function formatPercent(value: number) {
return `${(value * 100).toFixed(1)}%`;
}
export function buildQaAgenticParityComparison(params: {
candidateLabel: string;
baselineLabel: string;
candidateSummary: QaParitySuiteSummary;
baselineSummary: QaParitySuiteSummary;
comparedAt?: string;
}): QaAgenticParityComparison {
const candidateMetrics = computeQaAgenticParityMetrics(params.candidateSummary);
const baselineMetrics = computeQaAgenticParityMetrics(params.baselineSummary);
const scenarioNames = new Set([
...params.candidateSummary.scenarios.map((scenario) => scenario.name),
...params.baselineSummary.scenarios.map((scenario) => scenario.name),
]);
const candidateByName = new Map(
params.candidateSummary.scenarios.map((scenario) => [scenario.name, scenario]),
);
const baselineByName = new Map(
params.baselineSummary.scenarios.map((scenario) => [scenario.name, scenario]),
);
const scenarioComparisons = [...scenarioNames]
.toSorted((left, right) => left.localeCompare(right))
.map((name) => {
const candidate = candidateByName.get(name);
const baseline = baselineByName.get(name);
return {
name,
candidateStatus: candidate ? normalizeScenarioStatus(candidate.status) : "missing",
baselineStatus: baseline ? normalizeScenarioStatus(baseline.status) : "missing",
...(candidate?.details ? { candidateDetails: candidate.details } : {}),
...(baseline?.details ? { baselineDetails: baseline.details } : {}),
} satisfies QaAgenticParityScenarioComparison;
});
const failures: string[] = [];
if (candidateMetrics.completionRate < baselineMetrics.completionRate) {
failures.push(
`${params.candidateLabel} completion rate ${formatPercent(candidateMetrics.completionRate)} is below ${params.baselineLabel} ${formatPercent(baselineMetrics.completionRate)}.`,
);
}
if (candidateMetrics.unintendedStopRate > baselineMetrics.unintendedStopRate) {
failures.push(
`${params.candidateLabel} unintended-stop rate ${formatPercent(candidateMetrics.unintendedStopRate)} exceeds ${params.baselineLabel} ${formatPercent(baselineMetrics.unintendedStopRate)}.`,
);
}
if (candidateMetrics.validToolCallRate < baselineMetrics.validToolCallRate) {
failures.push(
`${params.candidateLabel} valid-tool-call rate ${formatPercent(candidateMetrics.validToolCallRate)} is below ${params.baselineLabel} ${formatPercent(baselineMetrics.validToolCallRate)}.`,
);
}
if (candidateMetrics.fakeSuccessCount > 0) {
failures.push(
`${params.candidateLabel} produced ${candidateMetrics.fakeSuccessCount} suspicious pass result(s); fake-success count must be 0.`,
);
}
return {
candidateLabel: params.candidateLabel,
baselineLabel: params.baselineLabel,
comparedAt: params.comparedAt ?? new Date().toISOString(),
candidateMetrics,
baselineMetrics,
scenarioComparisons,
pass: failures.length === 0,
failures,
notes: [
"First-wave valid-tool-call rate is scenario-level and uses passing tool-mediated scenarios as the verified numerator.",
"Auth/proxy/DNS correctness is intentionally out of scope for this parity report and should be gated by the deterministic runtime-truthfulness suites.",
],
};
}
export function renderQaAgenticParityMarkdownReport(comparison: QaAgenticParityComparison): string {
const lines = [
"# OpenClaw GPT-5.4 / Opus 4.6 Agentic Parity Report",
"",
`- Compared at: ${comparison.comparedAt}`,
`- Candidate: ${comparison.candidateLabel}`,
`- Baseline: ${comparison.baselineLabel}`,
`- Verdict: ${comparison.pass ? "pass" : "fail"}`,
"",
"## Aggregate Metrics",
"",
"| Metric | Candidate | Baseline |",
"| --- | ---: | ---: |",
`| Completion rate | ${formatPercent(comparison.candidateMetrics.completionRate)} | ${formatPercent(comparison.baselineMetrics.completionRate)} |`,
`| Unintended-stop rate | ${formatPercent(comparison.candidateMetrics.unintendedStopRate)} | ${formatPercent(comparison.baselineMetrics.unintendedStopRate)} |`,
`| Valid-tool-call rate | ${formatPercent(comparison.candidateMetrics.validToolCallRate)} | ${formatPercent(comparison.baselineMetrics.validToolCallRate)} |`,
`| Fake-success count | ${comparison.candidateMetrics.fakeSuccessCount} | ${comparison.baselineMetrics.fakeSuccessCount} |`,
"",
];
if (comparison.failures.length > 0) {
lines.push("## Gate Failures", "");
for (const failure of comparison.failures) {
lines.push(`- ${failure}`);
}
lines.push("");
}
lines.push("## Scenario Comparison", "");
for (const scenario of comparison.scenarioComparisons) {
lines.push(`### ${scenario.name}`, "");
lines.push(`- ${comparison.candidateLabel}: ${scenario.candidateStatus}`);
lines.push(`- ${comparison.baselineLabel}: ${scenario.baselineStatus}`);
if (scenario.candidateDetails) {
lines.push(`- ${comparison.candidateLabel} details: ${scenario.candidateDetails}`);
}
if (scenario.baselineDetails) {
lines.push(`- ${comparison.baselineLabel} details: ${scenario.baselineDetails}`);
}
lines.push("");
}
lines.push("## Notes", "");
for (const note of comparison.notes) {
lines.push(`- ${note}`);
}
lines.push("");
return lines.join("\n");
}

View File

@@ -1,4 +1,10 @@
import fs from "node:fs/promises";
import path from "node:path";
import {
buildQaAgenticParityComparison,
renderQaAgenticParityMarkdownReport,
type QaParitySuiteSummary,
} from "./agentic-parity-report.js";
import { resolveQaParityPackScenarioIds } from "./agentic-parity.js";
import { runQaCharacterEval, type QaCharacterModelOptions } from "./character-eval.js";
import { resolveRepoRelativeOutputDir } from "./cli-paths.js";
@@ -287,6 +293,45 @@ export async function runQaSuiteCommand(opts: {
process.stdout.write(`QA suite summary: ${result.summaryPath}\n`);
}
export async function runQaParityReportCommand(opts: {
repoRoot?: string;
candidateSummary: string;
baselineSummary: string;
candidateLabel?: string;
baselineLabel?: string;
outputDir?: string;
}) {
const repoRoot = path.resolve(opts.repoRoot ?? process.cwd());
const outputDir =
resolveRepoRelativeOutputDir(repoRoot, opts.outputDir) ??
path.join(repoRoot, ".artifacts", "qa-e2e", `parity-${Date.now().toString(36)}`);
await fs.mkdir(outputDir, { recursive: true });
const candidateSummaryPath = path.resolve(repoRoot, opts.candidateSummary);
const baselineSummaryPath = path.resolve(repoRoot, opts.baselineSummary);
const candidateSummary = JSON.parse(
await fs.readFile(candidateSummaryPath, "utf8"),
) as QaParitySuiteSummary;
const baselineSummary = JSON.parse(
await fs.readFile(baselineSummaryPath, "utf8"),
) as QaParitySuiteSummary;
const comparison = buildQaAgenticParityComparison({
candidateLabel: opts.candidateLabel?.trim() || "openai/gpt-5.4",
baselineLabel: opts.baselineLabel?.trim() || "anthropic/claude-opus-4-6",
candidateSummary,
baselineSummary,
});
const report = renderQaAgenticParityMarkdownReport(comparison);
const reportPath = path.join(outputDir, "qa-agentic-parity-report.md");
const summaryPath = path.join(outputDir, "qa-agentic-parity-summary.json");
await fs.writeFile(reportPath, report, "utf8");
await fs.writeFile(summaryPath, `${JSON.stringify(comparison, null, 2)}\n`, "utf8");
process.stdout.write(`QA parity report: ${reportPath}\n`);
process.stdout.write(`QA parity summary: ${summaryPath}\n`);
process.stdout.write(`QA parity verdict: ${comparison.pass ? "pass" : "fail"}\n`);
}
export async function runQaCharacterEvalCommand(opts: {
repoRoot?: string;
outputDir?: string;

View File

@@ -39,6 +39,17 @@ async function runQaSuite(opts: {
await runtime.runQaSuiteCommand(opts);
}
async function runQaParityReport(opts: {
repoRoot?: string;
candidateSummary: string;
baselineSummary: string;
candidateLabel?: string;
baselineLabel?: string;
outputDir?: string;
}) {
const runtime = await loadQaLabCliRuntime();
await runtime.runQaParityReportCommand(opts);
}
async function runQaCharacterEval(opts: {
repoRoot?: string;
outputDir?: string;
@@ -208,6 +219,27 @@ export function registerQaLabCli(program: Command) {
},
);
qa.command("parity-report")
.description("Compare two QA suite summaries and write an agentic parity gate report")
.requiredOption("--candidate-summary <path>", "Candidate qa-suite-summary.json path")
.requiredOption("--baseline-summary <path>", "Baseline qa-suite-summary.json path")
.option("--repo-root <path>", "Repository root to target when running from a neutral cwd")
.option("--candidate-label <label>", "Candidate display label", "openai/gpt-5.4")
.option("--baseline-label <label>", "Baseline display label", "anthropic/claude-opus-4-6")
.option("--output-dir <path>", "Artifact directory for the parity report")
.action(
async (opts: {
repoRoot?: string;
candidateSummary: string;
baselineSummary: string;
candidateLabel?: string;
baselineLabel?: string;
outputDir?: string;
}) => {
await runQaParityReport(opts);
},
);
for (const lane of LIVE_TRANSPORT_QA_CLI_REGISTRATIONS) {
lane.register(qa);
}