From 55df6f11a4e3b4e3c74b84ebdbd5a4262288153b Mon Sep 17 00:00:00 2001 From: Eva Date: Sat, 11 Apr 2026 04:17:57 +0700 Subject: [PATCH] fix: harden parity gate review findings --- docs/help/gpt54-codex-agentic-parity.md | 4 +- .../qa-lab/src/agentic-parity-report.test.ts | 68 ++++++++++++++++++- .../qa-lab/src/agentic-parity-report.ts | 15 ++++ extensions/qa-lab/src/agentic-parity.ts | 27 ++++++-- extensions/qa-lab/src/cli.runtime.test.ts | 38 +++++++++++ extensions/qa-lab/src/cli.runtime.ts | 3 + 6 files changed, 145 insertions(+), 10 deletions(-) diff --git a/docs/help/gpt54-codex-agentic-parity.md b/docs/help/gpt54-codex-agentic-parity.md index 6f171a60334..62ac1d40b5c 100644 --- a/docs/help/gpt54-codex-agentic-parity.md +++ b/docs/help/gpt54-codex-agentic-parity.md @@ -51,7 +51,7 @@ The parity pack is the proof layer. It does not change runtime behavior by itsel After you have two `qa-suite-summary.json` artifacts, generate the release-gate comparison with: ```bash -pnpm qa parity-report \ +pnpm openclaw qa parity-report \ --repo-root . \ --candidate-summary .artifacts/qa-e2e/gpt54/qa-suite-summary.json \ --baseline-summary .artifacts/qa-e2e/opus46/qa-suite-summary.json \ @@ -118,7 +118,7 @@ flowchart LR A --> C["Run Opus 4.6 parity pack"] B --> D["qa-suite-summary.json"] C --> E["qa-suite-summary.json"] - D --> F["qa parity-report"] + D --> F["openclaw qa parity-report"] E --> F F --> G["qa-agentic-parity-report.md"] F --> H["qa-agentic-parity-summary.json"] diff --git a/extensions/qa-lab/src/agentic-parity-report.test.ts b/extensions/qa-lab/src/agentic-parity-report.test.ts index dc09ade1c8e..f71df411641 100644 --- a/extensions/qa-lab/src/agentic-parity-report.test.ts +++ b/extensions/qa-lab/src/agentic-parity-report.test.ts @@ -78,15 +78,77 @@ describe("qa agentic parity report", () => { ); }); + it("fails the parity gate when required first-wave scenarios are missing on both sides", () => { + const comparison = buildQaAgenticParityComparison({ + candidateLabel: "openai/gpt-5.4", + baselineLabel: "anthropic/claude-opus-4-6", + candidateSummary: { + scenarios: [{ name: "Approval turn tool followthrough", status: "pass" }], + }, + baselineSummary: { + scenarios: [{ name: "Approval turn tool followthrough", status: "pass" }], + }, + comparedAt: "2026-04-11T00:00:00.000Z", + }); + + expect(comparison.pass).toBe(false); + expect(comparison.failures).toContain( + "Missing required first-wave parity scenario coverage for Image understanding from attachment: openai/gpt-5.4=missing, anthropic/claude-opus-4-6=missing.", + ); + }); + + it("fails the parity gate when the baseline contains suspicious pass results", () => { + const comparison = buildQaAgenticParityComparison({ + candidateLabel: "openai/gpt-5.4", + baselineLabel: "anthropic/claude-opus-4-6", + candidateSummary: { + scenarios: [ + { name: "Approval turn tool followthrough", status: "pass" }, + { name: "Model switch with tool continuity", status: "pass" }, + { name: "Source and docs discovery report", status: "pass" }, + { name: "Image understanding from attachment", status: "pass" }, + ], + }, + baselineSummary: { + scenarios: [ + { + name: "Approval turn tool followthrough", + status: "pass", + details: "timed out before it continued", + }, + { name: "Model switch with tool continuity", status: "pass" }, + { name: "Source and docs discovery report", status: "pass" }, + { name: "Image understanding from attachment", status: "pass" }, + ], + }, + comparedAt: "2026-04-11T00:00:00.000Z", + }); + + expect(comparison.pass).toBe(false); + expect(comparison.failures).toContain( + "anthropic/claude-opus-4-6 produced 1 suspicious pass result(s); baseline fake-success count must also be 0.", + ); + }); + it("renders a readable markdown parity report", () => { const comparison = buildQaAgenticParityComparison({ candidateLabel: "openai/gpt-5.4", baselineLabel: "anthropic/claude-opus-4-6", candidateSummary: { - scenarios: [{ name: "Scenario A", status: "pass" }], + scenarios: [ + { name: "Approval turn tool followthrough", status: "pass" }, + { name: "Model switch with tool continuity", status: "pass" }, + { name: "Source and docs discovery report", status: "pass" }, + { name: "Image understanding from attachment", status: "pass" }, + ], }, baselineSummary: { - scenarios: [{ name: "Scenario A", status: "pass" }], + scenarios: [ + { name: "Approval turn tool followthrough", status: "pass" }, + { name: "Model switch with tool continuity", status: "pass" }, + { name: "Source and docs discovery report", status: "pass" }, + { name: "Image understanding from attachment", status: "pass" }, + ], }, comparedAt: "2026-04-11T00:00:00.000Z", }); @@ -95,7 +157,7 @@ describe("qa agentic parity report", () => { expect(report).toContain("# OpenClaw GPT-5.4 / Opus 4.6 Agentic Parity Report"); expect(report).toContain("| Completion rate | 100.0% | 100.0% |"); - expect(report).toContain("### Scenario A"); + expect(report).toContain("### Approval turn tool followthrough"); expect(report).toContain("- Verdict: pass"); }); }); diff --git a/extensions/qa-lab/src/agentic-parity-report.ts b/extensions/qa-lab/src/agentic-parity-report.ts index 8b3f12863e1..8cd0bb45a3b 100644 --- a/extensions/qa-lab/src/agentic-parity-report.ts +++ b/extensions/qa-lab/src/agentic-parity-report.ts @@ -1,3 +1,5 @@ +import { QA_AGENTIC_PARITY_SCENARIO_TITLES } from "./agentic-parity.js"; + export type QaParityReportStep = { name: string; status: "pass" | "fail" | "skip"; @@ -170,6 +172,14 @@ export function buildQaAgenticParityComparison(params: { }); const failures: string[] = []; + const requiredScenarioCoverage = QA_AGENTIC_PARITY_SCENARIO_TITLES.filter( + (name) => !candidateByName.has(name) || !baselineByName.has(name), + ); + for (const name of requiredScenarioCoverage) { + failures.push( + `Missing required first-wave parity scenario coverage for ${name}: ${params.candidateLabel}=${candidateByName.has(name) ? "present" : "missing"}, ${params.baselineLabel}=${baselineByName.has(name) ? "present" : "missing"}.`, + ); + } const coverageMismatch = scenarioComparisons.filter( (scenario) => scenario.candidateStatus === "missing" || scenario.baselineStatus === "missing", ); @@ -198,6 +208,11 @@ export function buildQaAgenticParityComparison(params: { `${params.candidateLabel} produced ${candidateMetrics.fakeSuccessCount} suspicious pass result(s); fake-success count must be 0.`, ); } + if (baselineMetrics.fakeSuccessCount > 0) { + failures.push( + `${params.baselineLabel} produced ${baselineMetrics.fakeSuccessCount} suspicious pass result(s); baseline fake-success count must also be 0.`, + ); + } return { candidateLabel: params.candidateLabel, diff --git a/extensions/qa-lab/src/agentic-parity.ts b/extensions/qa-lab/src/agentic-parity.ts index d3021f6d0f2..73a59080360 100644 --- a/extensions/qa-lab/src/agentic-parity.ts +++ b/extensions/qa-lab/src/agentic-parity.ts @@ -1,12 +1,29 @@ export const QA_AGENTIC_PARITY_PACK = "agentic"; -export const QA_AGENTIC_PARITY_SCENARIO_IDS = [ - "approval-turn-tool-followthrough", - "model-switch-tool-continuity", - "source-docs-discovery-report", - "image-understanding-attachment", +export const QA_AGENTIC_PARITY_SCENARIOS = [ + { + id: "approval-turn-tool-followthrough", + title: "Approval turn tool followthrough", + }, + { + id: "model-switch-tool-continuity", + title: "Model switch with tool continuity", + }, + { + id: "source-docs-discovery-report", + title: "Source and docs discovery report", + }, + { + id: "image-understanding-attachment", + title: "Image understanding from attachment", + }, ] as const; +export const QA_AGENTIC_PARITY_SCENARIO_IDS = QA_AGENTIC_PARITY_SCENARIOS.map(({ id }) => id); +export const QA_AGENTIC_PARITY_SCENARIO_TITLES = QA_AGENTIC_PARITY_SCENARIOS.map( + ({ title }) => title, +); + export function resolveQaParityPackScenarioIds(params: { parityPack?: string; scenarioIds?: string[]; diff --git a/extensions/qa-lab/src/cli.runtime.test.ts b/extensions/qa-lab/src/cli.runtime.test.ts index b3ff349a353..8b7b5012db7 100644 --- a/extensions/qa-lab/src/cli.runtime.test.ts +++ b/extensions/qa-lab/src/cli.runtime.test.ts @@ -1,3 +1,5 @@ +import fs from "node:fs/promises"; +import os from "node:os"; import path from "node:path"; import { afterEach, beforeEach, describe, expect, it, vi } from "vitest"; @@ -70,6 +72,7 @@ import { runQaDockerUpCommand, runQaCharacterEvalCommand, runQaManualLaneCommand, + runQaParityReportCommand, runQaSuiteCommand, } from "./cli.runtime.js"; import { runQaMatrixCommand } from "./live-transports/matrix/cli.runtime.js"; @@ -344,6 +347,41 @@ describe("qa cli runtime", () => { }), ).rejects.toThrow("--cli-auth-mode must be one of auto, api-key, subscription"); }); + + it("sets a failing exit code when the parity gate fails", async () => { + const repoRoot = await fs.mkdtemp(path.join(os.tmpdir(), "qa-parity-")); + const priorExitCode = process.exitCode; + process.exitCode = undefined; + + try { + await fs.writeFile( + path.join(repoRoot, "candidate.json"), + JSON.stringify({ + scenarios: [{ name: "Approval turn tool followthrough", status: "pass" }], + }), + "utf8", + ); + await fs.writeFile( + path.join(repoRoot, "baseline.json"), + JSON.stringify({ + scenarios: [{ name: "Approval turn tool followthrough", status: "pass" }], + }), + "utf8", + ); + + await runQaParityReportCommand({ + repoRoot, + candidateSummary: "candidate.json", + baselineSummary: "baseline.json", + }); + + expect(process.exitCode).toBe(1); + } finally { + process.exitCode = priorExitCode; + await fs.rm(repoRoot, { recursive: true, force: true }); + } + }); + it("resolves character eval paths and passes model refs through", async () => { await runQaCharacterEvalCommand({ repoRoot: "/tmp/openclaw-repo", diff --git a/extensions/qa-lab/src/cli.runtime.ts b/extensions/qa-lab/src/cli.runtime.ts index 5fadfcc7262..ae5f4de09cc 100644 --- a/extensions/qa-lab/src/cli.runtime.ts +++ b/extensions/qa-lab/src/cli.runtime.ts @@ -331,6 +331,9 @@ export async function runQaParityReportCommand(opts: { process.stdout.write(`QA parity report: ${reportPath}\n`); process.stdout.write(`QA parity summary: ${summaryPath}\n`); process.stdout.write(`QA parity verdict: ${comparison.pass ? "pass" : "fail"}\n`); + if (!comparison.pass) { + process.exitCode = 1; + } } export async function runQaCharacterEvalCommand(opts: { repoRoot?: string;