mirror of
https://github.com/openclaw/openclaw.git
synced 2026-04-23 23:22:32 +00:00
benchmarks: add agentic parity report gate
This commit is contained in:
@@ -108,6 +108,12 @@ PR D is the proof layer. It should not be the reason runtime-correctness PRs are
|
||||
- reports are readable by humans and automation
|
||||
- parity claims are evidence-backed, not anecdotal
|
||||
|
||||
Expected artifacts from PR D:
|
||||
|
||||
- `qa-suite-report.md` / `qa-suite-summary.json` for each model run
|
||||
- `qa-agentic-parity-report.md` with aggregate and scenario-level comparison
|
||||
- `qa-agentic-parity-summary.json` with a machine-readable verdict
|
||||
|
||||
## Release gate
|
||||
|
||||
Do not claim GPT-5.4 parity or superiority over Opus 4.6 until:
|
||||
|
||||
@@ -48,6 +48,22 @@ This slice adds the first-wave QA-lab parity pack so GPT-5.4 and Opus 4.6 can be
|
||||
|
||||
The parity pack is the proof layer. It does not change runtime behavior by itself.
|
||||
|
||||
After you have two `qa-suite-summary.json` artifacts, generate the release-gate comparison with:
|
||||
|
||||
```bash
|
||||
pnpm qa parity-report \
|
||||
--repo-root . \
|
||||
--candidate-summary .artifacts/qa-e2e/gpt54/qa-suite-summary.json \
|
||||
--baseline-summary .artifacts/qa-e2e/opus46/qa-suite-summary.json \
|
||||
--output-dir .artifacts/qa-e2e/parity
|
||||
```
|
||||
|
||||
That command writes:
|
||||
|
||||
- a human-readable Markdown report
|
||||
- a machine-readable JSON verdict
|
||||
- an explicit `pass` / `fail` gate result
|
||||
|
||||
## Why this improves GPT-5.4 in practice
|
||||
|
||||
Before this work, GPT-5.4 on OpenClaw could feel less agentic than Opus in real coding sessions because the runtime tolerated behaviors that are especially harmful for GPT-5-style models:
|
||||
@@ -116,6 +132,13 @@ Required outcomes:
|
||||
- no silent replay or compaction abandonment
|
||||
- parity-pack metrics that are at least as strong as the agreed Opus 4.6 baseline
|
||||
|
||||
For the first-wave harness, the gate compares:
|
||||
|
||||
- completion rate
|
||||
- unintended-stop rate
|
||||
- valid-tool-call rate
|
||||
- fake-success count
|
||||
|
||||
## Who should enable `strict-agentic`
|
||||
|
||||
Use `strict-agentic` when:
|
||||
|
||||
79
extensions/qa-lab/src/agentic-parity-report.test.ts
Normal file
79
extensions/qa-lab/src/agentic-parity-report.test.ts
Normal file
@@ -0,0 +1,79 @@
|
||||
import { describe, expect, it } from "vitest";
|
||||
import {
|
||||
buildQaAgenticParityComparison,
|
||||
computeQaAgenticParityMetrics,
|
||||
renderQaAgenticParityMarkdownReport,
|
||||
type QaParitySuiteSummary,
|
||||
} from "./agentic-parity-report.js";
|
||||
|
||||
describe("qa agentic parity report", () => {
  // Verifies the first-wave metric derivation: one pass + one fail whose details
  // match an unintended-stop pattern ("incomplete turn").
  it("computes first-wave parity metrics from suite summaries", () => {
    const summary: QaParitySuiteSummary = {
      scenarios: [
        { name: "Scenario A", status: "pass" },
        { name: "Scenario B", status: "fail", details: "incomplete turn detected" },
      ],
    };

    // validToolCallCount mirrors passedScenarios in the first-wave harness.
    expect(computeQaAgenticParityMetrics(summary)).toEqual({
      totalScenarios: 2,
      passedScenarios: 1,
      failedScenarios: 1,
      completionRate: 0.5,
      unintendedStopCount: 1,
      unintendedStopRate: 0.5,
      validToolCallCount: 1,
      validToolCallRate: 0.5,
      fakeSuccessCount: 0,
    });
  });

  // The gate must fail when the candidate's completion rate drops below the
  // baseline or its unintended-stop rate exceeds the baseline's.
  it("fails the parity gate when the candidate regresses against baseline", () => {
    const comparison = buildQaAgenticParityComparison({
      candidateLabel: "openai/gpt-5.4",
      baselineLabel: "anthropic/claude-opus-4-6",
      candidateSummary: {
        scenarios: [
          { name: "Scenario A", status: "pass" },
          // "timed out" matches an unintended-stop pattern on a non-pass scenario.
          { name: "Scenario B", status: "fail", details: "timed out before it continued" },
        ],
      },
      baselineSummary: {
        scenarios: [
          { name: "Scenario A", status: "pass" },
          { name: "Scenario B", status: "pass" },
        ],
      },
      // Fixed timestamp keeps the comparison deterministic for assertions.
      comparedAt: "2026-04-11T00:00:00.000Z",
    });

    expect(comparison.pass).toBe(false);
    // Failure messages are part of the report contract; assert them verbatim.
    expect(comparison.failures).toContain(
      "openai/gpt-5.4 completion rate 50.0% is below anthropic/claude-opus-4-6 100.0%.",
    );
    expect(comparison.failures).toContain(
      "openai/gpt-5.4 unintended-stop rate 50.0% exceeds anthropic/claude-opus-4-6 0.0%.",
    );
  });

  // Smoke-checks the Markdown rendering: title, aggregate table row,
  // per-scenario heading, and the overall verdict line.
  it("renders a readable markdown parity report", () => {
    const comparison = buildQaAgenticParityComparison({
      candidateLabel: "openai/gpt-5.4",
      baselineLabel: "anthropic/claude-opus-4-6",
      candidateSummary: {
        scenarios: [{ name: "Scenario A", status: "pass" }],
      },
      baselineSummary: {
        scenarios: [{ name: "Scenario A", status: "pass" }],
      },
      comparedAt: "2026-04-11T00:00:00.000Z",
    });

    const report = renderQaAgenticParityMarkdownReport(comparison);

    expect(report).toContain("# OpenClaw GPT-5.4 / Opus 4.6 Agentic Parity Report");
    expect(report).toContain("| Completion rate | 100.0% | 100.0% |");
    expect(report).toContain("### Scenario A");
    expect(report).toContain("- Verdict: pass");
  });
});
|
||||
259
extensions/qa-lab/src/agentic-parity-report.ts
Normal file
259
extensions/qa-lab/src/agentic-parity-report.ts
Normal file
@@ -0,0 +1,259 @@
|
||||
/** One step executed inside a QA parity scenario. */
export type QaParityReportStep = {
  name: string;
  status: "pass" | "fail" | "skip";
  /** Optional free-text explanation (failure reason, timing note, etc.). */
  details?: string;
};

/** One scenario from a QA suite run, optionally broken down into steps. */
export type QaParityReportScenario = {
  name: string;
  status: "pass" | "fail" | "skip";
  details?: string;
  steps?: QaParityReportStep[];
};

/** Shape of a `qa-suite-summary.json` artifact consumed by the parity gate. */
export type QaParitySuiteSummary = {
  scenarios: QaParityReportScenario[];
  /** Optional pre-computed totals; when present they take precedence over derived counts. */
  counts?: {
    total?: number;
    passed?: number;
    failed?: number;
  };
};

/** First-wave aggregate metrics derived from a single suite summary. */
export type QaAgenticParityMetrics = {
  totalScenarios: number;
  passedScenarios: number;
  failedScenarios: number;
  /** passedScenarios / totalScenarios (0 when the suite is empty). */
  completionRate: number;
  unintendedStopCount: number;
  unintendedStopRate: number;
  validToolCallCount: number;
  validToolCallRate: number;
  /** Passing scenarios whose details still mention errors/timeouts. */
  fakeSuccessCount: number;
};

/** Per-scenario pairing of candidate and baseline results ("missing" when only one side ran it). */
export type QaAgenticParityScenarioComparison = {
  name: string;
  candidateStatus: "pass" | "fail" | "skip" | "missing";
  baselineStatus: "pass" | "fail" | "skip" | "missing";
  candidateDetails?: string;
  baselineDetails?: string;
};

/** Full parity-gate verdict: aggregate metrics, scenario pairs, and pass/fail with reasons. */
export type QaAgenticParityComparison = {
  candidateLabel: string;
  baselineLabel: string;
  /** ISO-8601 timestamp of when the comparison was generated. */
  comparedAt: string;
  candidateMetrics: QaAgenticParityMetrics;
  baselineMetrics: QaAgenticParityMetrics;
  scenarioComparisons: QaAgenticParityScenarioComparison[];
  /** True only when `failures` is empty. */
  pass: boolean;
  failures: string[];
  notes: string[];
};
|
||||
|
||||
// Phrases in a non-passing scenario's details that indicate the model stopped
// without intending to; matching scenarios feed the unintended-stop metric.
const UNINTENDED_STOP_PATTERNS = [
  /incomplete turn/i,
  /\btimed out\b/i,
  /\btimeout\b/i,
  /\bstopped\b/i,
  /\bblocked\b/i,
  /\babandoned\b/i,
  /did not continue/i,
] as const;
|
||||
|
||||
// Phrases that make a *passing* scenario suspicious: a pass whose details still
// mention timeouts/errors is counted as a potential "fake success".
const SUSPICIOUS_PASS_PATTERNS = [
  /incomplete turn/i,
  /\btimed out\b/i,
  /\btimeout\b/i,
  /\bblocked\b/i,
  /\berror\b/i,
  /\bfailed\b/i,
] as const;
|
||||
|
||||
function normalizeScenarioStatus(status: string | undefined): "pass" | "fail" | "skip" {
|
||||
return status === "pass" || status === "fail" || status === "skip" ? status : "fail";
|
||||
}
|
||||
|
||||
function scenarioText(scenario: QaParityReportScenario) {
|
||||
const parts = [scenario.details ?? ""];
|
||||
for (const step of scenario.steps ?? []) {
|
||||
parts.push(step.details ?? "");
|
||||
}
|
||||
return parts.filter(Boolean).join("\n");
|
||||
}
|
||||
|
||||
function scenarioHasPattern(
|
||||
scenario: QaParityReportScenario,
|
||||
patterns: readonly RegExp[],
|
||||
): boolean {
|
||||
const text = scenarioText(scenario);
|
||||
return text.length > 0 && patterns.some((pattern) => pattern.test(text));
|
||||
}
|
||||
|
||||
export function computeQaAgenticParityMetrics(
|
||||
summary: QaParitySuiteSummary,
|
||||
): QaAgenticParityMetrics {
|
||||
const scenarios = summary.scenarios.map((scenario) => ({
|
||||
...scenario,
|
||||
status: normalizeScenarioStatus(scenario.status),
|
||||
}));
|
||||
const totalScenarios = summary.counts?.total ?? scenarios.length;
|
||||
const passedScenarios =
|
||||
summary.counts?.passed ?? scenarios.filter((scenario) => scenario.status === "pass").length;
|
||||
const failedScenarios =
|
||||
summary.counts?.failed ?? scenarios.filter((scenario) => scenario.status === "fail").length;
|
||||
const unintendedStopCount = scenarios.filter(
|
||||
(scenario) =>
|
||||
scenario.status !== "pass" && scenarioHasPattern(scenario, UNINTENDED_STOP_PATTERNS),
|
||||
).length;
|
||||
const fakeSuccessCount = scenarios.filter(
|
||||
(scenario) =>
|
||||
scenario.status === "pass" && scenarioHasPattern(scenario, SUSPICIOUS_PASS_PATTERNS),
|
||||
).length;
|
||||
|
||||
// First-wave parity scenarios are all tool-mediated tasks, so a passing scenario is our
|
||||
// verified unit of valid tool-backed execution in this harness.
|
||||
const validToolCallCount = passedScenarios;
|
||||
|
||||
const rate = (value: number) => (totalScenarios > 0 ? value / totalScenarios : 0);
|
||||
return {
|
||||
totalScenarios,
|
||||
passedScenarios,
|
||||
failedScenarios,
|
||||
completionRate: rate(passedScenarios),
|
||||
unintendedStopCount,
|
||||
unintendedStopRate: rate(unintendedStopCount),
|
||||
validToolCallCount,
|
||||
validToolCallRate: rate(validToolCallCount),
|
||||
fakeSuccessCount,
|
||||
};
|
||||
}
|
||||
|
||||
function formatPercent(value: number) {
|
||||
return `${(value * 100).toFixed(1)}%`;
|
||||
}
|
||||
|
||||
export function buildQaAgenticParityComparison(params: {
|
||||
candidateLabel: string;
|
||||
baselineLabel: string;
|
||||
candidateSummary: QaParitySuiteSummary;
|
||||
baselineSummary: QaParitySuiteSummary;
|
||||
comparedAt?: string;
|
||||
}): QaAgenticParityComparison {
|
||||
const candidateMetrics = computeQaAgenticParityMetrics(params.candidateSummary);
|
||||
const baselineMetrics = computeQaAgenticParityMetrics(params.baselineSummary);
|
||||
|
||||
const scenarioNames = new Set([
|
||||
...params.candidateSummary.scenarios.map((scenario) => scenario.name),
|
||||
...params.baselineSummary.scenarios.map((scenario) => scenario.name),
|
||||
]);
|
||||
const candidateByName = new Map(
|
||||
params.candidateSummary.scenarios.map((scenario) => [scenario.name, scenario]),
|
||||
);
|
||||
const baselineByName = new Map(
|
||||
params.baselineSummary.scenarios.map((scenario) => [scenario.name, scenario]),
|
||||
);
|
||||
|
||||
const scenarioComparisons = [...scenarioNames]
|
||||
.toSorted((left, right) => left.localeCompare(right))
|
||||
.map((name) => {
|
||||
const candidate = candidateByName.get(name);
|
||||
const baseline = baselineByName.get(name);
|
||||
return {
|
||||
name,
|
||||
candidateStatus: candidate ? normalizeScenarioStatus(candidate.status) : "missing",
|
||||
baselineStatus: baseline ? normalizeScenarioStatus(baseline.status) : "missing",
|
||||
...(candidate?.details ? { candidateDetails: candidate.details } : {}),
|
||||
...(baseline?.details ? { baselineDetails: baseline.details } : {}),
|
||||
} satisfies QaAgenticParityScenarioComparison;
|
||||
});
|
||||
|
||||
const failures: string[] = [];
|
||||
if (candidateMetrics.completionRate < baselineMetrics.completionRate) {
|
||||
failures.push(
|
||||
`${params.candidateLabel} completion rate ${formatPercent(candidateMetrics.completionRate)} is below ${params.baselineLabel} ${formatPercent(baselineMetrics.completionRate)}.`,
|
||||
);
|
||||
}
|
||||
if (candidateMetrics.unintendedStopRate > baselineMetrics.unintendedStopRate) {
|
||||
failures.push(
|
||||
`${params.candidateLabel} unintended-stop rate ${formatPercent(candidateMetrics.unintendedStopRate)} exceeds ${params.baselineLabel} ${formatPercent(baselineMetrics.unintendedStopRate)}.`,
|
||||
);
|
||||
}
|
||||
if (candidateMetrics.validToolCallRate < baselineMetrics.validToolCallRate) {
|
||||
failures.push(
|
||||
`${params.candidateLabel} valid-tool-call rate ${formatPercent(candidateMetrics.validToolCallRate)} is below ${params.baselineLabel} ${formatPercent(baselineMetrics.validToolCallRate)}.`,
|
||||
);
|
||||
}
|
||||
if (candidateMetrics.fakeSuccessCount > 0) {
|
||||
failures.push(
|
||||
`${params.candidateLabel} produced ${candidateMetrics.fakeSuccessCount} suspicious pass result(s); fake-success count must be 0.`,
|
||||
);
|
||||
}
|
||||
|
||||
return {
|
||||
candidateLabel: params.candidateLabel,
|
||||
baselineLabel: params.baselineLabel,
|
||||
comparedAt: params.comparedAt ?? new Date().toISOString(),
|
||||
candidateMetrics,
|
||||
baselineMetrics,
|
||||
scenarioComparisons,
|
||||
pass: failures.length === 0,
|
||||
failures,
|
||||
notes: [
|
||||
"First-wave valid-tool-call rate is scenario-level and uses passing tool-mediated scenarios as the verified numerator.",
|
||||
"Auth/proxy/DNS correctness is intentionally out of scope for this parity report and should be gated by the deterministic runtime-truthfulness suites.",
|
||||
],
|
||||
};
|
||||
}
|
||||
|
||||
export function renderQaAgenticParityMarkdownReport(comparison: QaAgenticParityComparison): string {
|
||||
const lines = [
|
||||
"# OpenClaw GPT-5.4 / Opus 4.6 Agentic Parity Report",
|
||||
"",
|
||||
`- Compared at: ${comparison.comparedAt}`,
|
||||
`- Candidate: ${comparison.candidateLabel}`,
|
||||
`- Baseline: ${comparison.baselineLabel}`,
|
||||
`- Verdict: ${comparison.pass ? "pass" : "fail"}`,
|
||||
"",
|
||||
"## Aggregate Metrics",
|
||||
"",
|
||||
"| Metric | Candidate | Baseline |",
|
||||
"| --- | ---: | ---: |",
|
||||
`| Completion rate | ${formatPercent(comparison.candidateMetrics.completionRate)} | ${formatPercent(comparison.baselineMetrics.completionRate)} |`,
|
||||
`| Unintended-stop rate | ${formatPercent(comparison.candidateMetrics.unintendedStopRate)} | ${formatPercent(comparison.baselineMetrics.unintendedStopRate)} |`,
|
||||
`| Valid-tool-call rate | ${formatPercent(comparison.candidateMetrics.validToolCallRate)} | ${formatPercent(comparison.baselineMetrics.validToolCallRate)} |`,
|
||||
`| Fake-success count | ${comparison.candidateMetrics.fakeSuccessCount} | ${comparison.baselineMetrics.fakeSuccessCount} |`,
|
||||
"",
|
||||
];
|
||||
|
||||
if (comparison.failures.length > 0) {
|
||||
lines.push("## Gate Failures", "");
|
||||
for (const failure of comparison.failures) {
|
||||
lines.push(`- ${failure}`);
|
||||
}
|
||||
lines.push("");
|
||||
}
|
||||
|
||||
lines.push("## Scenario Comparison", "");
|
||||
for (const scenario of comparison.scenarioComparisons) {
|
||||
lines.push(`### ${scenario.name}`, "");
|
||||
lines.push(`- ${comparison.candidateLabel}: ${scenario.candidateStatus}`);
|
||||
lines.push(`- ${comparison.baselineLabel}: ${scenario.baselineStatus}`);
|
||||
if (scenario.candidateDetails) {
|
||||
lines.push(`- ${comparison.candidateLabel} details: ${scenario.candidateDetails}`);
|
||||
}
|
||||
if (scenario.baselineDetails) {
|
||||
lines.push(`- ${comparison.baselineLabel} details: ${scenario.baselineDetails}`);
|
||||
}
|
||||
lines.push("");
|
||||
}
|
||||
|
||||
lines.push("## Notes", "");
|
||||
for (const note of comparison.notes) {
|
||||
lines.push(`- ${note}`);
|
||||
}
|
||||
lines.push("");
|
||||
|
||||
return lines.join("\n");
|
||||
}
|
||||
@@ -1,4 +1,10 @@
|
||||
import fs from "node:fs/promises";
|
||||
import path from "node:path";
|
||||
import {
|
||||
buildQaAgenticParityComparison,
|
||||
renderQaAgenticParityMarkdownReport,
|
||||
type QaParitySuiteSummary,
|
||||
} from "./agentic-parity-report.js";
|
||||
import { resolveQaParityPackScenarioIds } from "./agentic-parity.js";
|
||||
import { runQaCharacterEval, type QaCharacterModelOptions } from "./character-eval.js";
|
||||
import { resolveRepoRelativeOutputDir } from "./cli-paths.js";
|
||||
@@ -287,6 +293,45 @@ export async function runQaSuiteCommand(opts: {
|
||||
process.stdout.write(`QA suite summary: ${result.summaryPath}\n`);
|
||||
}
|
||||
|
||||
export async function runQaParityReportCommand(opts: {
|
||||
repoRoot?: string;
|
||||
candidateSummary: string;
|
||||
baselineSummary: string;
|
||||
candidateLabel?: string;
|
||||
baselineLabel?: string;
|
||||
outputDir?: string;
|
||||
}) {
|
||||
const repoRoot = path.resolve(opts.repoRoot ?? process.cwd());
|
||||
const outputDir =
|
||||
resolveRepoRelativeOutputDir(repoRoot, opts.outputDir) ??
|
||||
path.join(repoRoot, ".artifacts", "qa-e2e", `parity-${Date.now().toString(36)}`);
|
||||
await fs.mkdir(outputDir, { recursive: true });
|
||||
|
||||
const candidateSummaryPath = path.resolve(repoRoot, opts.candidateSummary);
|
||||
const baselineSummaryPath = path.resolve(repoRoot, opts.baselineSummary);
|
||||
const candidateSummary = JSON.parse(
|
||||
await fs.readFile(candidateSummaryPath, "utf8"),
|
||||
) as QaParitySuiteSummary;
|
||||
const baselineSummary = JSON.parse(
|
||||
await fs.readFile(baselineSummaryPath, "utf8"),
|
||||
) as QaParitySuiteSummary;
|
||||
|
||||
const comparison = buildQaAgenticParityComparison({
|
||||
candidateLabel: opts.candidateLabel?.trim() || "openai/gpt-5.4",
|
||||
baselineLabel: opts.baselineLabel?.trim() || "anthropic/claude-opus-4-6",
|
||||
candidateSummary,
|
||||
baselineSummary,
|
||||
});
|
||||
const report = renderQaAgenticParityMarkdownReport(comparison);
|
||||
const reportPath = path.join(outputDir, "qa-agentic-parity-report.md");
|
||||
const summaryPath = path.join(outputDir, "qa-agentic-parity-summary.json");
|
||||
await fs.writeFile(reportPath, report, "utf8");
|
||||
await fs.writeFile(summaryPath, `${JSON.stringify(comparison, null, 2)}\n`, "utf8");
|
||||
|
||||
process.stdout.write(`QA parity report: ${reportPath}\n`);
|
||||
process.stdout.write(`QA parity summary: ${summaryPath}\n`);
|
||||
process.stdout.write(`QA parity verdict: ${comparison.pass ? "pass" : "fail"}\n`);
|
||||
}
|
||||
export async function runQaCharacterEvalCommand(opts: {
|
||||
repoRoot?: string;
|
||||
outputDir?: string;
|
||||
|
||||
@@ -39,6 +39,17 @@ async function runQaSuite(opts: {
|
||||
await runtime.runQaSuiteCommand(opts);
|
||||
}
|
||||
|
||||
async function runQaParityReport(opts: {
|
||||
repoRoot?: string;
|
||||
candidateSummary: string;
|
||||
baselineSummary: string;
|
||||
candidateLabel?: string;
|
||||
baselineLabel?: string;
|
||||
outputDir?: string;
|
||||
}) {
|
||||
const runtime = await loadQaLabCliRuntime();
|
||||
await runtime.runQaParityReportCommand(opts);
|
||||
}
|
||||
async function runQaCharacterEval(opts: {
|
||||
repoRoot?: string;
|
||||
outputDir?: string;
|
||||
@@ -208,6 +219,27 @@ export function registerQaLabCli(program: Command) {
|
||||
},
|
||||
);
|
||||
|
||||
// Registers `qa parity-report`: compares two qa-suite-summary.json artifacts
// and writes the agentic parity gate report (Markdown + JSON verdict).
// Label options default to the first-wave candidate/baseline model ids.
qa.command("parity-report")
  .description("Compare two QA suite summaries and write an agentic parity gate report")
  .requiredOption("--candidate-summary <path>", "Candidate qa-suite-summary.json path")
  .requiredOption("--baseline-summary <path>", "Baseline qa-suite-summary.json path")
  .option("--repo-root <path>", "Repository root to target when running from a neutral cwd")
  .option("--candidate-label <label>", "Candidate display label", "openai/gpt-5.4")
  .option("--baseline-label <label>", "Baseline display label", "anthropic/claude-opus-4-6")
  .option("--output-dir <path>", "Artifact directory for the parity report")
  .action(
    async (opts: {
      repoRoot?: string;
      candidateSummary: string;
      baselineSummary: string;
      candidateLabel?: string;
      baselineLabel?: string;
      outputDir?: string;
    }) => {
      await runQaParityReport(opts);
    },
  );
|
||||
|
||||
for (const lane of LIVE_TRANSPORT_QA_CLI_REGISTRATIONS) {
|
||||
lane.register(qa);
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user