From 55df6f11a4e3b4e3c74b84ebdbd5a4262288153b Mon Sep 17 00:00:00 2001
From: Eva <eva@100yen.org>
Date: Sat, 11 Apr 2026 04:17:57 +0700
Subject: [PATCH] fix: harden parity gate per review findings

---
 docs/help/gpt54-codex-agentic-parity.md       |  4 +-
 .../qa-lab/src/agentic-parity-report.test.ts  | 68 ++++++++++++++++++-
 .../qa-lab/src/agentic-parity-report.ts       | 15 ++++
 extensions/qa-lab/src/agentic-parity.ts       | 27 ++++++--
 extensions/qa-lab/src/cli.runtime.test.ts     | 38 +++++++++++
 extensions/qa-lab/src/cli.runtime.ts          |  3 +
 6 files changed, 145 insertions(+), 10 deletions(-)

diff --git a/docs/help/gpt54-codex-agentic-parity.md b/docs/help/gpt54-codex-agentic-parity.md
index 6f171a60334..62ac1d40b5c 100644
--- a/docs/help/gpt54-codex-agentic-parity.md
+++ b/docs/help/gpt54-codex-agentic-parity.md
@@ -51,7 +51,7 @@ The parity pack is the proof layer. It does not change runtime behavior by itsel
 After you have two `qa-suite-summary.json` artifacts, generate the release-gate comparison with:
 
 ```bash
-pnpm qa parity-report \
+pnpm openclaw qa parity-report \
   --repo-root . \
   --candidate-summary .artifacts/qa-e2e/gpt54/qa-suite-summary.json \
   --baseline-summary .artifacts/qa-e2e/opus46/qa-suite-summary.json \
@@ -118,7 +118,7 @@ flowchart LR
     A --> C["Run Opus 4.6 parity pack"]
     B --> D["qa-suite-summary.json"]
     C --> E["qa-suite-summary.json"]
-    D --> F["qa parity-report"]
+    D --> F["openclaw qa parity-report"]
     E --> F
     F --> G["qa-agentic-parity-report.md"]
     F --> H["qa-agentic-parity-summary.json"]
diff --git a/extensions/qa-lab/src/agentic-parity-report.test.ts b/extensions/qa-lab/src/agentic-parity-report.test.ts
index dc09ade1c8e..f71df411641 100644
--- a/extensions/qa-lab/src/agentic-parity-report.test.ts
+++ b/extensions/qa-lab/src/agentic-parity-report.test.ts
@@ -78,15 +78,77 @@ describe("qa agentic parity report", () => {
     );
   });
 
+  it("fails the parity gate when required first-wave scenarios are missing on both sides", () => {
+    const comparison = buildQaAgenticParityComparison({
+      candidateLabel: "openai/gpt-5.4",
+      baselineLabel: "anthropic/claude-opus-4-6",
+      candidateSummary: {
+        scenarios: [{ name: "Approval turn tool followthrough", status: "pass" }],
+      },
+      baselineSummary: {
+        scenarios: [{ name: "Approval turn tool followthrough", status: "pass" }],
+      },
+      comparedAt: "2026-04-11T00:00:00.000Z",
+    });
+
+    expect(comparison.pass).toBe(false);
+    expect(comparison.failures).toContain(
+      "Missing required first-wave parity scenario coverage for Image understanding from attachment: openai/gpt-5.4=missing, anthropic/claude-opus-4-6=missing.",
+    );
+  });
+
+  it("fails the parity gate when the baseline contains suspicious pass results", () => {
+    const comparison = buildQaAgenticParityComparison({
+      candidateLabel: "openai/gpt-5.4",
+      baselineLabel: "anthropic/claude-opus-4-6",
+      candidateSummary: {
+        scenarios: [
+          { name: "Approval turn tool followthrough", status: "pass" },
+          { name: "Model switch with tool continuity", status: "pass" },
+          { name: "Source and docs discovery report", status: "pass" },
+          { name: "Image understanding from attachment", status: "pass" },
+        ],
+      },
+      baselineSummary: {
+        scenarios: [
+          {
+            name: "Approval turn tool followthrough",
+            status: "pass",
+            details: "timed out before it continued",
+          },
+          { name: "Model switch with tool continuity", status: "pass" },
+          { name: "Source and docs discovery report", status: "pass" },
+          { name: "Image understanding from attachment", status: "pass" },
+        ],
+      },
+      comparedAt: "2026-04-11T00:00:00.000Z",
+    });
+
+    expect(comparison.pass).toBe(false);
+    expect(comparison.failures).toContain(
+      "anthropic/claude-opus-4-6 produced 1 suspicious pass result(s); baseline fake-success count must also be 0.",
+    );
+  });
+
   it("renders a readable markdown parity report", () => {
     const comparison = buildQaAgenticParityComparison({
       candidateLabel: "openai/gpt-5.4",
       baselineLabel: "anthropic/claude-opus-4-6",
       candidateSummary: {
-        scenarios: [{ name: "Scenario A", status: "pass" }],
+        scenarios: [
+          { name: "Approval turn tool followthrough", status: "pass" },
+          { name: "Model switch with tool continuity", status: "pass" },
+          { name: "Source and docs discovery report", status: "pass" },
+          { name: "Image understanding from attachment", status: "pass" },
+        ],
       },
       baselineSummary: {
-        scenarios: [{ name: "Scenario A", status: "pass" }],
+        scenarios: [
+          { name: "Approval turn tool followthrough", status: "pass" },
+          { name: "Model switch with tool continuity", status: "pass" },
+          { name: "Source and docs discovery report", status: "pass" },
+          { name: "Image understanding from attachment", status: "pass" },
+        ],
       },
       comparedAt: "2026-04-11T00:00:00.000Z",
     });
@@ -95,7 +157,7 @@ describe("qa agentic parity report", () => {
 
     expect(report).toContain("# OpenClaw GPT-5.4 / Opus 4.6 Agentic Parity Report");
     expect(report).toContain("| Completion rate | 100.0% | 100.0% |");
-    expect(report).toContain("### Scenario A");
+    expect(report).toContain("### Approval turn tool followthrough");
     expect(report).toContain("- Verdict: pass");
   });
 });
diff --git a/extensions/qa-lab/src/agentic-parity-report.ts b/extensions/qa-lab/src/agentic-parity-report.ts
index 8b3f12863e1..8cd0bb45a3b 100644
--- a/extensions/qa-lab/src/agentic-parity-report.ts
+++ b/extensions/qa-lab/src/agentic-parity-report.ts
@@ -1,3 +1,5 @@
+import { QA_AGENTIC_PARITY_SCENARIO_TITLES } from "./agentic-parity.js";
+
 export type QaParityReportStep = {
   name: string;
   status: "pass" | "fail" | "skip";
@@ -170,6 +172,14 @@ export function buildQaAgenticParityComparison(params: {
     });
 
   const failures: string[] = [];
+  const requiredScenarioCoverage = QA_AGENTIC_PARITY_SCENARIO_TITLES.filter(
+    (name) => !candidateByName.has(name) || !baselineByName.has(name),
+  );
+  for (const name of requiredScenarioCoverage) {
+    failures.push(
+      `Missing required first-wave parity scenario coverage for ${name}: ${params.candidateLabel}=${candidateByName.has(name) ? "present" : "missing"}, ${params.baselineLabel}=${baselineByName.has(name) ? "present" : "missing"}.`,
+    );
+  }
   const coverageMismatch = scenarioComparisons.filter(
     (scenario) => scenario.candidateStatus === "missing" || scenario.baselineStatus === "missing",
   );
@@ -198,6 +208,11 @@ export function buildQaAgenticParityComparison(params: {
       `${params.candidateLabel} produced ${candidateMetrics.fakeSuccessCount} suspicious pass result(s); fake-success count must be 0.`,
     );
   }
+  if (baselineMetrics.fakeSuccessCount > 0) {
+    failures.push(
+      `${params.baselineLabel} produced ${baselineMetrics.fakeSuccessCount} suspicious pass result(s); baseline fake-success count must also be 0.`,
+    );
+  }
 
   return {
     candidateLabel: params.candidateLabel,
diff --git a/extensions/qa-lab/src/agentic-parity.ts b/extensions/qa-lab/src/agentic-parity.ts
index d3021f6d0f2..73a59080360 100644
--- a/extensions/qa-lab/src/agentic-parity.ts
+++ b/extensions/qa-lab/src/agentic-parity.ts
@@ -1,12 +1,29 @@
 export const QA_AGENTIC_PARITY_PACK = "agentic";
 
-export const QA_AGENTIC_PARITY_SCENARIO_IDS = [
-  "approval-turn-tool-followthrough",
-  "model-switch-tool-continuity",
-  "source-docs-discovery-report",
-  "image-understanding-attachment",
+export const QA_AGENTIC_PARITY_SCENARIOS = [
+  {
+    id: "approval-turn-tool-followthrough",
+    title: "Approval turn tool followthrough",
+  },
+  {
+    id: "model-switch-tool-continuity",
+    title: "Model switch with tool continuity",
+  },
+  {
+    id: "source-docs-discovery-report",
+    title: "Source and docs discovery report",
+  },
+  {
+    id: "image-understanding-attachment",
+    title: "Image understanding from attachment",
+  },
 ] as const;
 
+export const QA_AGENTIC_PARITY_SCENARIO_IDS = QA_AGENTIC_PARITY_SCENARIOS.map(({ id }) => id);
+export const QA_AGENTIC_PARITY_SCENARIO_TITLES = QA_AGENTIC_PARITY_SCENARIOS.map(
+  ({ title }) => title,
+);
+
 export function resolveQaParityPackScenarioIds(params: {
   parityPack?: string;
   scenarioIds?: string[];
diff --git a/extensions/qa-lab/src/cli.runtime.test.ts b/extensions/qa-lab/src/cli.runtime.test.ts
index b3ff349a353..8b7b5012db7 100644
--- a/extensions/qa-lab/src/cli.runtime.test.ts
+++ b/extensions/qa-lab/src/cli.runtime.test.ts
@@ -1,3 +1,5 @@
+import fs from "node:fs/promises";
+import os from "node:os";
 import path from "node:path";
 import { afterEach, beforeEach, describe, expect, it, vi } from "vitest";
 
@@ -70,6 +72,7 @@ import {
   runQaDockerUpCommand,
   runQaCharacterEvalCommand,
   runQaManualLaneCommand,
+  runQaParityReportCommand,
   runQaSuiteCommand,
 } from "./cli.runtime.js";
 import { runQaMatrixCommand } from "./live-transports/matrix/cli.runtime.js";
@@ -344,6 +347,41 @@ describe("qa cli runtime", () => {
       }),
     ).rejects.toThrow("--cli-auth-mode must be one of auto, api-key, subscription");
   });
+
+  it("sets a failing exit code when the parity gate fails", async () => {
+    const repoRoot = await fs.mkdtemp(path.join(os.tmpdir(), "qa-parity-"));
+    const priorExitCode = process.exitCode;
+    process.exitCode = undefined;
+
+    try {
+      await fs.writeFile(
+        path.join(repoRoot, "candidate.json"),
+        JSON.stringify({
+          scenarios: [{ name: "Approval turn tool followthrough", status: "pass" }],
+        }),
+        "utf8",
+      );
+      await fs.writeFile(
+        path.join(repoRoot, "baseline.json"),
+        JSON.stringify({
+          scenarios: [{ name: "Approval turn tool followthrough", status: "pass" }],
+        }),
+        "utf8",
+      );
+
+      await runQaParityReportCommand({
+        repoRoot,
+        candidateSummary: "candidate.json",
+        baselineSummary: "baseline.json",
+      });
+
+      expect(process.exitCode).toBe(1);
+    } finally {
+      process.exitCode = priorExitCode;
+      await fs.rm(repoRoot, { recursive: true, force: true });
+    }
+  });
+
   it("resolves character eval paths and passes model refs through", async () => {
     await runQaCharacterEvalCommand({
       repoRoot: "/tmp/openclaw-repo",
diff --git a/extensions/qa-lab/src/cli.runtime.ts b/extensions/qa-lab/src/cli.runtime.ts
index 5fadfcc7262..ae5f4de09cc 100644
--- a/extensions/qa-lab/src/cli.runtime.ts
+++ b/extensions/qa-lab/src/cli.runtime.ts
@@ -331,6 +331,9 @@ export async function runQaParityReportCommand(opts: {
   process.stdout.write(`QA parity report: ${reportPath}\n`);
   process.stdout.write(`QA parity summary: ${summaryPath}\n`);
   process.stdout.write(`QA parity verdict: ${comparison.pass ? "pass" : "fail"}\n`);
+  if (!comparison.pass) {
+    process.exitCode = 1;
+  }
 }
 export async function runQaCharacterEvalCommand(opts: {
   repoRoot?: string;