fix: harden parity gate per review findings

2026-04-12 01:31:08 +00:00 · 2026-04-11 04:17:57 +07:00
parent c73d005c7a
commit 55df6f11a4
6 changed files with 145 additions and 10 deletions
--- a/docs/help/gpt54-codex-agentic-parity.md
+++ b/docs/help/gpt54-codex-agentic-parity.md
@@ -51,7 +51,7 @@ The parity pack is the proof layer. It does not change runtime behavior by itsel
 After you have two `qa-suite-summary.json` artifacts, generate the release-gate comparison with:

 ```bash
-pnpm qa parity-report \
+pnpm openclaw qa parity-report \
  --repo-root . \
  --candidate-summary .artifacts/qa-e2e/gpt54/qa-suite-summary.json \
  --baseline-summary .artifacts/qa-e2e/opus46/qa-suite-summary.json \
@@ -118,7 +118,7 @@ flowchart LR
    A --> C["Run Opus 4.6 parity pack"]
    B --> D["qa-suite-summary.json"]
    C --> E["qa-suite-summary.json"]
-    D --> F["qa parity-report"]
+    D --> F["openclaw qa parity-report"]
    E --> F
    F --> G["qa-agentic-parity-report.md"]
    F --> H["qa-agentic-parity-summary.json"]
--- a/extensions/qa-lab/src/agentic-parity-report.test.ts
+++ b/extensions/qa-lab/src/agentic-parity-report.test.ts
@@ -78,15 +78,77 @@ describe("qa agentic parity report", () => {
    );
  });

+  it("fails the parity gate when required first-wave scenarios are missing on both sides", () => {
+    const comparison = buildQaAgenticParityComparison({
+      candidateLabel: "openai/gpt-5.4",
+      baselineLabel: "anthropic/claude-opus-4-6",
+      candidateSummary: {
+        scenarios: [{ name: "Approval turn tool followthrough", status: "pass" }],
+      },
+      baselineSummary: {
+        scenarios: [{ name: "Approval turn tool followthrough", status: "pass" }],
+      },
+      comparedAt: "2026-04-11T00:00:00.000Z",
+    });
+
+    expect(comparison.pass).toBe(false);
+    expect(comparison.failures).toContain(
+      "Missing required first-wave parity scenario coverage for Image understanding from attachment: openai/gpt-5.4=missing, anthropic/claude-opus-4-6=missing.",
+    );
+  });
+
+  it("fails the parity gate when the baseline contains suspicious pass results", () => {
+    const comparison = buildQaAgenticParityComparison({
+      candidateLabel: "openai/gpt-5.4",
+      baselineLabel: "anthropic/claude-opus-4-6",
+      candidateSummary: {
+        scenarios: [
+          { name: "Approval turn tool followthrough", status: "pass" },
+          { name: "Model switch with tool continuity", status: "pass" },
+          { name: "Source and docs discovery report", status: "pass" },
+          { name: "Image understanding from attachment", status: "pass" },
+        ],
+      },
+      baselineSummary: {
+        scenarios: [
+          {
+            name: "Approval turn tool followthrough",
+            status: "pass",
+            details: "timed out before it continued",
+          },
+          { name: "Model switch with tool continuity", status: "pass" },
+          { name: "Source and docs discovery report", status: "pass" },
+          { name: "Image understanding from attachment", status: "pass" },
+        ],
+      },
+      comparedAt: "2026-04-11T00:00:00.000Z",
+    });
+
+    expect(comparison.pass).toBe(false);
+    expect(comparison.failures).toContain(
+      "anthropic/claude-opus-4-6 produced 1 suspicious pass result(s); baseline fake-success count must also be 0.",
+    );
+  });
+
  it("renders a readable markdown parity report", () => {
    const comparison = buildQaAgenticParityComparison({
      candidateLabel: "openai/gpt-5.4",
      baselineLabel: "anthropic/claude-opus-4-6",
      candidateSummary: {
-        scenarios: [{ name: "Scenario A", status: "pass" }],
+        scenarios: [
+          { name: "Approval turn tool followthrough", status: "pass" },
+          { name: "Model switch with tool continuity", status: "pass" },
+          { name: "Source and docs discovery report", status: "pass" },
+          { name: "Image understanding from attachment", status: "pass" },
+        ],
      },
      baselineSummary: {
-        scenarios: [{ name: "Scenario A", status: "pass" }],
+        scenarios: [
+          { name: "Approval turn tool followthrough", status: "pass" },
+          { name: "Model switch with tool continuity", status: "pass" },
+          { name: "Source and docs discovery report", status: "pass" },
+          { name: "Image understanding from attachment", status: "pass" },
+        ],
      },
      comparedAt: "2026-04-11T00:00:00.000Z",
    });
@@ -95,7 +157,7 @@ describe("qa agentic parity report", () => {

    expect(report).toContain("# OpenClaw GPT-5.4 / Opus 4.6 Agentic Parity Report");
    expect(report).toContain("| Completion rate | 100.0% | 100.0% |");
-    expect(report).toContain("### Scenario A");
+    expect(report).toContain("### Approval turn tool followthrough");
    expect(report).toContain("- Verdict: pass");
  });
 });
--- a/extensions/qa-lab/src/agentic-parity-report.ts
+++ b/extensions/qa-lab/src/agentic-parity-report.ts
@@ -1,3 +1,5 @@
+import { QA_AGENTIC_PARITY_SCENARIO_TITLES } from "./agentic-parity.js";
+
 export type QaParityReportStep = {
  name: string;
  status: "pass" | "fail" | "skip";
@@ -170,6 +172,14 @@ export function buildQaAgenticParityComparison(params: {
    });

  const failures: string[] = [];
+  const requiredScenarioCoverage = QA_AGENTIC_PARITY_SCENARIO_TITLES.filter(
+    (name) => !candidateByName.has(name) || !baselineByName.has(name),
+  );
+  for (const name of requiredScenarioCoverage) {
+    failures.push(
+      `Missing required first-wave parity scenario coverage for ${name}: ${params.candidateLabel}=${candidateByName.has(name) ? "present" : "missing"}, ${params.baselineLabel}=${baselineByName.has(name) ? "present" : "missing"}.`,
+    );
+  }
  const coverageMismatch = scenarioComparisons.filter(
    (scenario) => scenario.candidateStatus === "missing" || scenario.baselineStatus === "missing",
  );
@@ -198,6 +208,11 @@ export function buildQaAgenticParityComparison(params: {
      `${params.candidateLabel} produced ${candidateMetrics.fakeSuccessCount} suspicious pass result(s); fake-success count must be 0.`,
    );
  }
+  if (baselineMetrics.fakeSuccessCount > 0) {
+    failures.push(
+      `${params.baselineLabel} produced ${baselineMetrics.fakeSuccessCount} suspicious pass result(s); baseline fake-success count must also be 0.`,
+    );
+  }

  return {
    candidateLabel: params.candidateLabel,
--- a/extensions/qa-lab/src/agentic-parity.ts
+++ b/extensions/qa-lab/src/agentic-parity.ts
@@ -1,12 +1,29 @@
 export const QA_AGENTIC_PARITY_PACK = "agentic";

-export const QA_AGENTIC_PARITY_SCENARIO_IDS = [
-  "approval-turn-tool-followthrough",
-  "model-switch-tool-continuity",
-  "source-docs-discovery-report",
-  "image-understanding-attachment",
+export const QA_AGENTIC_PARITY_SCENARIOS = [
+  {
+    id: "approval-turn-tool-followthrough",
+    title: "Approval turn tool followthrough",
+  },
+  {
+    id: "model-switch-tool-continuity",
+    title: "Model switch with tool continuity",
+  },
+  {
+    id: "source-docs-discovery-report",
+    title: "Source and docs discovery report",
+  },
+  {
+    id: "image-understanding-attachment",
+    title: "Image understanding from attachment",
+  },
 ] as const;

+export const QA_AGENTIC_PARITY_SCENARIO_IDS = QA_AGENTIC_PARITY_SCENARIOS.map(({ id }) => id);
+export const QA_AGENTIC_PARITY_SCENARIO_TITLES = QA_AGENTIC_PARITY_SCENARIOS.map(
+  ({ title }) => title,
+);
+
 export function resolveQaParityPackScenarioIds(params: {
  parityPack?: string;
  scenarioIds?: string[];
--- a/extensions/qa-lab/src/cli.runtime.test.ts
+++ b/extensions/qa-lab/src/cli.runtime.test.ts
@@ -1,3 +1,5 @@
+import fs from "node:fs/promises";
+import os from "node:os";
 import path from "node:path";
 import { afterEach, beforeEach, describe, expect, it, vi } from "vitest";

@@ -70,6 +72,7 @@ import {
  runQaDockerUpCommand,
  runQaCharacterEvalCommand,
  runQaManualLaneCommand,
+  runQaParityReportCommand,
  runQaSuiteCommand,
 } from "./cli.runtime.js";
 import { runQaMatrixCommand } from "./live-transports/matrix/cli.runtime.js";
@@ -344,6 +347,41 @@ describe("qa cli runtime", () => {
      }),
    ).rejects.toThrow("--cli-auth-mode must be one of auto, api-key, subscription");
  });
+
+  it("sets a failing exit code when the parity gate fails", async () => {
+    const repoRoot = await fs.mkdtemp(path.join(os.tmpdir(), "qa-parity-"));
+    const priorExitCode = process.exitCode;
+    process.exitCode = undefined;
+
+    try {
+      await fs.writeFile(
+        path.join(repoRoot, "candidate.json"),
+        JSON.stringify({
+          scenarios: [{ name: "Approval turn tool followthrough", status: "pass" }],
+        }),
+        "utf8",
+      );
+      await fs.writeFile(
+        path.join(repoRoot, "baseline.json"),
+        JSON.stringify({
+          scenarios: [{ name: "Approval turn tool followthrough", status: "pass" }],
+        }),
+        "utf8",
+      );
+
+      await runQaParityReportCommand({
+        repoRoot,
+        candidateSummary: "candidate.json",
+        baselineSummary: "baseline.json",
+      });
+
+      expect(process.exitCode).toBe(1);
+    } finally {
+      process.exitCode = priorExitCode;
+      await fs.rm(repoRoot, { recursive: true, force: true });
+    }
+  });
+
  it("resolves character eval paths and passes model refs through", async () => {
    await runQaCharacterEvalCommand({
      repoRoot: "/tmp/openclaw-repo",
--- a/extensions/qa-lab/src/cli.runtime.ts
+++ b/extensions/qa-lab/src/cli.runtime.ts
@@ -331,6 +331,9 @@ export async function runQaParityReportCommand(opts: {
  process.stdout.write(`QA parity report: ${reportPath}\n`);
  process.stdout.write(`QA parity summary: ${summaryPath}\n`);
  process.stdout.write(`QA parity verdict: ${comparison.pass ? "pass" : "fail"}\n`);
+  if (!comparison.pass) {
+    process.exitCode = 1;
+  }
 }
 export async function runQaCharacterEvalCommand(opts: {
  repoRoot?: string;