mirror of
https://github.com/openclaw/openclaw.git
synced 2026-04-12 01:31:08 +00:00
fix: harden parity gate review findings
This commit is contained in:
@@ -51,7 +51,7 @@ The parity pack is the proof layer. It does not change runtime behavior by itsel
|
||||
After you have two `qa-suite-summary.json` artifacts, generate the release-gate comparison with:
|
||||
|
||||
```bash
|
||||
pnpm qa parity-report \
|
||||
pnpm openclaw qa parity-report \
|
||||
--repo-root . \
|
||||
--candidate-summary .artifacts/qa-e2e/gpt54/qa-suite-summary.json \
|
||||
--baseline-summary .artifacts/qa-e2e/opus46/qa-suite-summary.json \
|
||||
@@ -118,7 +118,7 @@ flowchart LR
|
||||
A --> C["Run Opus 4.6 parity pack"]
|
||||
B --> D["qa-suite-summary.json"]
|
||||
C --> E["qa-suite-summary.json"]
|
||||
D --> F["qa parity-report"]
|
||||
D --> F["openclaw qa parity-report"]
|
||||
E --> F
|
||||
F --> G["qa-agentic-parity-report.md"]
|
||||
F --> H["qa-agentic-parity-summary.json"]
|
||||
|
||||
@@ -78,15 +78,77 @@ describe("qa agentic parity report", () => {
|
||||
);
|
||||
});
|
||||
|
||||
it("fails the parity gate when required first-wave scenarios are missing on both sides", () => {
|
||||
const comparison = buildQaAgenticParityComparison({
|
||||
candidateLabel: "openai/gpt-5.4",
|
||||
baselineLabel: "anthropic/claude-opus-4-6",
|
||||
candidateSummary: {
|
||||
scenarios: [{ name: "Approval turn tool followthrough", status: "pass" }],
|
||||
},
|
||||
baselineSummary: {
|
||||
scenarios: [{ name: "Approval turn tool followthrough", status: "pass" }],
|
||||
},
|
||||
comparedAt: "2026-04-11T00:00:00.000Z",
|
||||
});
|
||||
|
||||
expect(comparison.pass).toBe(false);
|
||||
expect(comparison.failures).toContain(
|
||||
"Missing required first-wave parity scenario coverage for Image understanding from attachment: openai/gpt-5.4=missing, anthropic/claude-opus-4-6=missing.",
|
||||
);
|
||||
});
|
||||
|
||||
it("fails the parity gate when the baseline contains suspicious pass results", () => {
|
||||
const comparison = buildQaAgenticParityComparison({
|
||||
candidateLabel: "openai/gpt-5.4",
|
||||
baselineLabel: "anthropic/claude-opus-4-6",
|
||||
candidateSummary: {
|
||||
scenarios: [
|
||||
{ name: "Approval turn tool followthrough", status: "pass" },
|
||||
{ name: "Model switch with tool continuity", status: "pass" },
|
||||
{ name: "Source and docs discovery report", status: "pass" },
|
||||
{ name: "Image understanding from attachment", status: "pass" },
|
||||
],
|
||||
},
|
||||
baselineSummary: {
|
||||
scenarios: [
|
||||
{
|
||||
name: "Approval turn tool followthrough",
|
||||
status: "pass",
|
||||
details: "timed out before it continued",
|
||||
},
|
||||
{ name: "Model switch with tool continuity", status: "pass" },
|
||||
{ name: "Source and docs discovery report", status: "pass" },
|
||||
{ name: "Image understanding from attachment", status: "pass" },
|
||||
],
|
||||
},
|
||||
comparedAt: "2026-04-11T00:00:00.000Z",
|
||||
});
|
||||
|
||||
expect(comparison.pass).toBe(false);
|
||||
expect(comparison.failures).toContain(
|
||||
"anthropic/claude-opus-4-6 produced 1 suspicious pass result(s); baseline fake-success count must also be 0.",
|
||||
);
|
||||
});
|
||||
|
||||
it("renders a readable markdown parity report", () => {
|
||||
const comparison = buildQaAgenticParityComparison({
|
||||
candidateLabel: "openai/gpt-5.4",
|
||||
baselineLabel: "anthropic/claude-opus-4-6",
|
||||
candidateSummary: {
|
||||
scenarios: [{ name: "Scenario A", status: "pass" }],
|
||||
scenarios: [
|
||||
{ name: "Approval turn tool followthrough", status: "pass" },
|
||||
{ name: "Model switch with tool continuity", status: "pass" },
|
||||
{ name: "Source and docs discovery report", status: "pass" },
|
||||
{ name: "Image understanding from attachment", status: "pass" },
|
||||
],
|
||||
},
|
||||
baselineSummary: {
|
||||
scenarios: [{ name: "Scenario A", status: "pass" }],
|
||||
scenarios: [
|
||||
{ name: "Approval turn tool followthrough", status: "pass" },
|
||||
{ name: "Model switch with tool continuity", status: "pass" },
|
||||
{ name: "Source and docs discovery report", status: "pass" },
|
||||
{ name: "Image understanding from attachment", status: "pass" },
|
||||
],
|
||||
},
|
||||
comparedAt: "2026-04-11T00:00:00.000Z",
|
||||
});
|
||||
@@ -95,7 +157,7 @@ describe("qa agentic parity report", () => {
|
||||
|
||||
expect(report).toContain("# OpenClaw GPT-5.4 / Opus 4.6 Agentic Parity Report");
|
||||
expect(report).toContain("| Completion rate | 100.0% | 100.0% |");
|
||||
expect(report).toContain("### Scenario A");
|
||||
expect(report).toContain("### Approval turn tool followthrough");
|
||||
expect(report).toContain("- Verdict: pass");
|
||||
});
|
||||
});
|
||||
|
||||
@@ -1,3 +1,5 @@
|
||||
import { QA_AGENTIC_PARITY_SCENARIO_TITLES } from "./agentic-parity.js";
|
||||
|
||||
export type QaParityReportStep = {
|
||||
name: string;
|
||||
status: "pass" | "fail" | "skip";
|
||||
@@ -170,6 +172,14 @@ export function buildQaAgenticParityComparison(params: {
|
||||
});
|
||||
|
||||
const failures: string[] = [];
|
||||
const requiredScenarioCoverage = QA_AGENTIC_PARITY_SCENARIO_TITLES.filter(
|
||||
(name) => !candidateByName.has(name) || !baselineByName.has(name),
|
||||
);
|
||||
for (const name of requiredScenarioCoverage) {
|
||||
failures.push(
|
||||
`Missing required first-wave parity scenario coverage for ${name}: ${params.candidateLabel}=${candidateByName.has(name) ? "present" : "missing"}, ${params.baselineLabel}=${baselineByName.has(name) ? "present" : "missing"}.`,
|
||||
);
|
||||
}
|
||||
const coverageMismatch = scenarioComparisons.filter(
|
||||
(scenario) => scenario.candidateStatus === "missing" || scenario.baselineStatus === "missing",
|
||||
);
|
||||
@@ -198,6 +208,11 @@ export function buildQaAgenticParityComparison(params: {
|
||||
`${params.candidateLabel} produced ${candidateMetrics.fakeSuccessCount} suspicious pass result(s); fake-success count must be 0.`,
|
||||
);
|
||||
}
|
||||
if (baselineMetrics.fakeSuccessCount > 0) {
|
||||
failures.push(
|
||||
`${params.baselineLabel} produced ${baselineMetrics.fakeSuccessCount} suspicious pass result(s); baseline fake-success count must also be 0.`,
|
||||
);
|
||||
}
|
||||
|
||||
return {
|
||||
candidateLabel: params.candidateLabel,
|
||||
|
||||
@@ -1,12 +1,29 @@
|
||||
export const QA_AGENTIC_PARITY_PACK = "agentic";
|
||||
|
||||
export const QA_AGENTIC_PARITY_SCENARIO_IDS = [
|
||||
"approval-turn-tool-followthrough",
|
||||
"model-switch-tool-continuity",
|
||||
"source-docs-discovery-report",
|
||||
"image-understanding-attachment",
|
||||
export const QA_AGENTIC_PARITY_SCENARIOS = [
|
||||
{
|
||||
id: "approval-turn-tool-followthrough",
|
||||
title: "Approval turn tool followthrough",
|
||||
},
|
||||
{
|
||||
id: "model-switch-tool-continuity",
|
||||
title: "Model switch with tool continuity",
|
||||
},
|
||||
{
|
||||
id: "source-docs-discovery-report",
|
||||
title: "Source and docs discovery report",
|
||||
},
|
||||
{
|
||||
id: "image-understanding-attachment",
|
||||
title: "Image understanding from attachment",
|
||||
},
|
||||
] as const;
|
||||
|
||||
export const QA_AGENTIC_PARITY_SCENARIO_IDS = QA_AGENTIC_PARITY_SCENARIOS.map(({ id }) => id);
|
||||
export const QA_AGENTIC_PARITY_SCENARIO_TITLES = QA_AGENTIC_PARITY_SCENARIOS.map(
|
||||
({ title }) => title,
|
||||
);
|
||||
|
||||
export function resolveQaParityPackScenarioIds(params: {
|
||||
parityPack?: string;
|
||||
scenarioIds?: string[];
|
||||
|
||||
@@ -1,3 +1,5 @@
|
||||
import fs from "node:fs/promises";
|
||||
import os from "node:os";
|
||||
import path from "node:path";
|
||||
import { afterEach, beforeEach, describe, expect, it, vi } from "vitest";
|
||||
|
||||
@@ -70,6 +72,7 @@ import {
|
||||
runQaDockerUpCommand,
|
||||
runQaCharacterEvalCommand,
|
||||
runQaManualLaneCommand,
|
||||
runQaParityReportCommand,
|
||||
runQaSuiteCommand,
|
||||
} from "./cli.runtime.js";
|
||||
import { runQaMatrixCommand } from "./live-transports/matrix/cli.runtime.js";
|
||||
@@ -344,6 +347,41 @@ describe("qa cli runtime", () => {
|
||||
}),
|
||||
).rejects.toThrow("--cli-auth-mode must be one of auto, api-key, subscription");
|
||||
});
|
||||
|
||||
it("sets a failing exit code when the parity gate fails", async () => {
|
||||
const repoRoot = await fs.mkdtemp(path.join(os.tmpdir(), "qa-parity-"));
|
||||
const priorExitCode = process.exitCode;
|
||||
process.exitCode = undefined;
|
||||
|
||||
try {
|
||||
await fs.writeFile(
|
||||
path.join(repoRoot, "candidate.json"),
|
||||
JSON.stringify({
|
||||
scenarios: [{ name: "Approval turn tool followthrough", status: "pass" }],
|
||||
}),
|
||||
"utf8",
|
||||
);
|
||||
await fs.writeFile(
|
||||
path.join(repoRoot, "baseline.json"),
|
||||
JSON.stringify({
|
||||
scenarios: [{ name: "Approval turn tool followthrough", status: "pass" }],
|
||||
}),
|
||||
"utf8",
|
||||
);
|
||||
|
||||
await runQaParityReportCommand({
|
||||
repoRoot,
|
||||
candidateSummary: "candidate.json",
|
||||
baselineSummary: "baseline.json",
|
||||
});
|
||||
|
||||
expect(process.exitCode).toBe(1);
|
||||
} finally {
|
||||
process.exitCode = priorExitCode;
|
||||
await fs.rm(repoRoot, { recursive: true, force: true });
|
||||
}
|
||||
});
|
||||
|
||||
it("resolves character eval paths and passes model refs through", async () => {
|
||||
await runQaCharacterEvalCommand({
|
||||
repoRoot: "/tmp/openclaw-repo",
|
||||
|
||||
@@ -331,6 +331,9 @@ export async function runQaParityReportCommand(opts: {
|
||||
process.stdout.write(`QA parity report: ${reportPath}\n`);
|
||||
process.stdout.write(`QA parity summary: ${summaryPath}\n`);
|
||||
process.stdout.write(`QA parity verdict: ${comparison.pass ? "pass" : "fail"}\n`);
|
||||
if (!comparison.pass) {
|
||||
process.exitCode = 1;
|
||||
}
|
||||
}
|
||||
export async function runQaCharacterEvalCommand(opts: {
|
||||
repoRoot?: string;
|
||||
|
||||
Reference in New Issue
Block a user