fix: harden parity gate review findings

This commit is contained in:
Eva
2026-04-11 04:17:57 +07:00
committed by Peter Steinberger
parent c73d005c7a
commit 55df6f11a4
6 changed files with 145 additions and 10 deletions

View File

@@ -51,7 +51,7 @@ The parity pack is the proof layer. It does not change runtime behavior by itsel
After you have two `qa-suite-summary.json` artifacts, generate the release-gate comparison with:
```bash
pnpm qa parity-report \
pnpm openclaw qa parity-report \
--repo-root . \
--candidate-summary .artifacts/qa-e2e/gpt54/qa-suite-summary.json \
--baseline-summary .artifacts/qa-e2e/opus46/qa-suite-summary.json \
@@ -118,7 +118,7 @@ flowchart LR
A --> C["Run Opus 4.6 parity pack"]
B --> D["qa-suite-summary.json"]
C --> E["qa-suite-summary.json"]
D --> F["qa parity-report"]
D --> F["openclaw qa parity-report"]
E --> F
F --> G["qa-agentic-parity-report.md"]
F --> H["qa-agentic-parity-summary.json"]

View File

@@ -78,15 +78,77 @@ describe("qa agentic parity report", () => {
);
});
// Coverage gate: both summaries agree on the single scenario they report, but a
// required first-wave scenario is absent from each, so the comparison must fail.
it("fails the parity gate when required first-wave scenarios are missing on both sides", () => {
  const expectedFailure =
    "Missing required first-wave parity scenario coverage for Image understanding from attachment: openai/gpt-5.4=missing, anthropic/claude-opus-4-6=missing.";

  const { pass, failures } = buildQaAgenticParityComparison({
    candidateLabel: "openai/gpt-5.4",
    baselineLabel: "anthropic/claude-opus-4-6",
    // Each side reports only one scenario.
    candidateSummary: {
      scenarios: [{ name: "Approval turn tool followthrough", status: "pass" }],
    },
    baselineSummary: {
      scenarios: [{ name: "Approval turn tool followthrough", status: "pass" }],
    },
    comparedAt: "2026-04-11T00:00:00.000Z",
  });

  expect(pass).toBe(false);
  expect(failures).toContain(expectedFailure);
});
// Fake-success gate: a baseline scenario marked "pass" whose details read like a
// timeout must fail the comparison, even when the candidate run is clean.
it("fails the parity gate when the baseline contains suspicious pass results", () => {
  const scenarioNames = [
    "Approval turn tool followthrough",
    "Model switch with tool continuity",
    "Source and docs discovery report",
    "Image understanding from attachment",
  ];
  const passing = (name: string) => ({ name, status: "pass" as const });

  // Candidate: all four scenarios pass with no details attached.
  const candidateScenarios = scenarioNames.map((name) => passing(name));
  // Baseline: same scenarios, but the first "pass" carries timeout-like details.
  const baselineScenarios = scenarioNames.map((name, position) =>
    position === 0
      ? { ...passing(name), details: "timed out before it continued" }
      : passing(name),
  );

  const { pass, failures } = buildQaAgenticParityComparison({
    candidateLabel: "openai/gpt-5.4",
    baselineLabel: "anthropic/claude-opus-4-6",
    candidateSummary: { scenarios: candidateScenarios },
    baselineSummary: { scenarios: baselineScenarios },
    comparedAt: "2026-04-11T00:00:00.000Z",
  });

  expect(pass).toBe(false);
  expect(failures).toContain(
    "anthropic/claude-opus-4-6 produced 1 suspicious pass result(s); baseline fake-success count must also be 0.",
  );
});
it("renders a readable markdown parity report", () => {
const comparison = buildQaAgenticParityComparison({
candidateLabel: "openai/gpt-5.4",
baselineLabel: "anthropic/claude-opus-4-6",
candidateSummary: {
scenarios: [{ name: "Scenario A", status: "pass" }],
scenarios: [
{ name: "Approval turn tool followthrough", status: "pass" },
{ name: "Model switch with tool continuity", status: "pass" },
{ name: "Source and docs discovery report", status: "pass" },
{ name: "Image understanding from attachment", status: "pass" },
],
},
baselineSummary: {
scenarios: [{ name: "Scenario A", status: "pass" }],
scenarios: [
{ name: "Approval turn tool followthrough", status: "pass" },
{ name: "Model switch with tool continuity", status: "pass" },
{ name: "Source and docs discovery report", status: "pass" },
{ name: "Image understanding from attachment", status: "pass" },
],
},
comparedAt: "2026-04-11T00:00:00.000Z",
});
@@ -95,7 +157,7 @@ describe("qa agentic parity report", () => {
expect(report).toContain("# OpenClaw GPT-5.4 / Opus 4.6 Agentic Parity Report");
expect(report).toContain("| Completion rate | 100.0% | 100.0% |");
expect(report).toContain("### Scenario A");
expect(report).toContain("### Approval turn tool followthrough");
expect(report).toContain("- Verdict: pass");
});
});

View File

@@ -1,3 +1,5 @@
import { QA_AGENTIC_PARITY_SCENARIO_TITLES } from "./agentic-parity.js";
export type QaParityReportStep = {
name: string;
status: "pass" | "fail" | "skip";
@@ -170,6 +172,14 @@ export function buildQaAgenticParityComparison(params: {
});
const failures: string[] = [];
const requiredScenarioCoverage = QA_AGENTIC_PARITY_SCENARIO_TITLES.filter(
(name) => !candidateByName.has(name) || !baselineByName.has(name),
);
for (const name of requiredScenarioCoverage) {
failures.push(
`Missing required first-wave parity scenario coverage for ${name}: ${params.candidateLabel}=${candidateByName.has(name) ? "present" : "missing"}, ${params.baselineLabel}=${baselineByName.has(name) ? "present" : "missing"}.`,
);
}
const coverageMismatch = scenarioComparisons.filter(
(scenario) => scenario.candidateStatus === "missing" || scenario.baselineStatus === "missing",
);
@@ -198,6 +208,11 @@ export function buildQaAgenticParityComparison(params: {
`${params.candidateLabel} produced ${candidateMetrics.fakeSuccessCount} suspicious pass result(s); fake-success count must be 0.`,
);
}
if (baselineMetrics.fakeSuccessCount > 0) {
failures.push(
`${params.baselineLabel} produced ${baselineMetrics.fakeSuccessCount} suspicious pass result(s); baseline fake-success count must also be 0.`,
);
}
return {
candidateLabel: params.candidateLabel,

View File

@@ -1,12 +1,29 @@
export const QA_AGENTIC_PARITY_PACK = "agentic";
export const QA_AGENTIC_PARITY_SCENARIO_IDS = [
"approval-turn-tool-followthrough",
"model-switch-tool-continuity",
"source-docs-discovery-report",
"image-understanding-attachment",
export const QA_AGENTIC_PARITY_SCENARIOS = [
{
id: "approval-turn-tool-followthrough",
title: "Approval turn tool followthrough",
},
{
id: "model-switch-tool-continuity",
title: "Model switch with tool continuity",
},
{
id: "source-docs-discovery-report",
title: "Source and docs discovery report",
},
{
id: "image-understanding-attachment",
title: "Image understanding from attachment",
},
] as const;
export const QA_AGENTIC_PARITY_SCENARIO_IDS = QA_AGENTIC_PARITY_SCENARIOS.map(({ id }) => id);
export const QA_AGENTIC_PARITY_SCENARIO_TITLES = QA_AGENTIC_PARITY_SCENARIOS.map(
({ title }) => title,
);
export function resolveQaParityPackScenarioIds(params: {
parityPack?: string;
scenarioIds?: string[];

View File

@@ -1,3 +1,5 @@
import fs from "node:fs/promises";
import os from "node:os";
import path from "node:path";
import { afterEach, beforeEach, describe, expect, it, vi } from "vitest";
@@ -70,6 +72,7 @@ import {
runQaDockerUpCommand,
runQaCharacterEvalCommand,
runQaManualLaneCommand,
runQaParityReportCommand,
runQaSuiteCommand,
} from "./cli.runtime.js";
import { runQaMatrixCommand } from "./live-transports/matrix/cli.runtime.js";
@@ -344,6 +347,41 @@ describe("qa cli runtime", () => {
}),
).rejects.toThrow("--cli-auth-mode must be one of auto, api-key, subscription");
});
// CLI contract: when the parity comparison does not pass, the command must set
// process.exitCode to 1.
it("sets a failing exit code when the parity gate fails", async () => {
  const tempRepoRoot = await fs.mkdtemp(path.join(os.tmpdir(), "qa-parity-"));
  // Remember the current exit code so this test does not leak state to others.
  const savedExitCode = process.exitCode;
  process.exitCode = undefined;

  try {
    // Both summaries report the same single passing scenario.
    const summaryJson = JSON.stringify({
      scenarios: [{ name: "Approval turn tool followthrough", status: "pass" }],
    });
    for (const fileName of ["candidate.json", "baseline.json"]) {
      await fs.writeFile(path.join(tempRepoRoot, fileName), summaryJson, "utf8");
    }

    await runQaParityReportCommand({
      repoRoot: tempRepoRoot,
      candidateSummary: "candidate.json",
      baselineSummary: "baseline.json",
    });

    expect(process.exitCode).toBe(1);
  } finally {
    // Restore the exit code and remove the temporary directory.
    process.exitCode = savedExitCode;
    await fs.rm(tempRepoRoot, { recursive: true, force: true });
  }
});
it("resolves character eval paths and passes model refs through", async () => {
await runQaCharacterEvalCommand({
repoRoot: "/tmp/openclaw-repo",

View File

@@ -331,6 +331,9 @@ export async function runQaParityReportCommand(opts: {
process.stdout.write(`QA parity report: ${reportPath}\n`);
process.stdout.write(`QA parity summary: ${summaryPath}\n`);
process.stdout.write(`QA parity verdict: ${comparison.pass ? "pass" : "fail"}\n`);
if (!comparison.pass) {
process.exitCode = 1;
}
}
export async function runQaCharacterEvalCommand(opts: {
repoRoot?: string;