test(release): stabilize qa runtime parity gate

This commit is contained in:
Vincent Koc
2026-06-08 21:02:00 +02:00
parent bad449301f
commit c7b01cf201
6 changed files with 63 additions and 22 deletions

View File

@@ -889,14 +889,14 @@ status=done`,
expect(report.failures).toEqual([]);
});
it("fails runtime parity reports when a runtime cell fails", () => {
it("fails runtime parity reports when a runtime cell has a hard failure", () => {
const summary = makeRuntimeParitySummary();
const scenario = summary.scenarios[1];
if (!scenario?.runtimeParity) {
throw new Error("runtime parity fixture missing");
}
scenario.status = "fail";
scenario.runtimeParity.cells.codex.runtimeErrorClass = "tool-error";
scenario.runtimeParity.cells.codex.runtimeErrorClass = "auth";
const report = buildQaRuntimeParityReport({
summary,
@@ -910,6 +910,24 @@ status=done`,
);
});
// Checks that marking only the codex cell with a "tool-error" runtime class
// still yields a passing report with no failed scenarios and no failures.
it("passes runtime parity reports with controlled tool-error cells and advisory drift", () => {
const summary = makeRuntimeParitySummary();
const scenario = summary.scenarios[1];
// Guard: throws when the fixture's second scenario carries no runtimeParity data.
if (!scenario?.runtimeParity) {
throw new Error("runtime parity fixture missing");
}
// Only the codex cell's runtimeErrorClass is changed here; scenario.status is
// left exactly as the fixture produced it.
scenario.runtimeParity.cells.codex.runtimeErrorClass = "tool-error";
const report = buildQaRuntimeParityReport({
summary,
comparedAt: "2026-05-10T00:00:00.000Z",
});
// The report as a whole must pass, count zero failed scenarios, and list no failures.
expect(report.pass).toBe(true);
expect(report.failedScenarios).toBe(0);
expect(report.failures).toEqual([]);
});
it("fails live runtime parity reports when assistant-message usage is missing", () => {
const summary = makeRuntimeParitySummary();
summary.run = {

View File

@@ -906,7 +906,6 @@ describe("qa cli runtime", () => {
"runtime-tool-fs-read",
"runtime-tool-fs-write",
"runtime-tool-grep",
"runtime-tool-image-generate",
"runtime-tool-session-status",
"runtime-tool-sessions-spawn",
"runtime-tool-web-fetch",
@@ -924,6 +923,7 @@ describe("qa cli runtime", () => {
expectFields(mockFirstObjectArg(runQaSuiteFromRuntime), {
scenarioIds: [
"runtime-soak-100-turn",
"runtime-tool-image-generate",
"runtime-tool-memory-add",
"runtime-tool-memory-recall",
"runtime-tool-message-tool",
@@ -1040,7 +1040,7 @@ describe("qa cli runtime", () => {
},
},
],
counts: { total: 1, passed: 0, failed: 1 },
counts: { total: 1, passed: 1, failed: 0 },
run: {
providerMode: "mock-openai",
primaryModel: "openai/gpt-5.5",
@@ -1056,12 +1056,12 @@ describe("qa cli runtime", () => {
summary: "runtime-summary.json",
});
expect(process.exitCode).toBe(1);
expect(process.exitCode).toBeUndefined();
expect(stdoutWrite).toHaveBeenCalledWith(
expect.stringContaining("QA runtime parity report:"),
);
expect(stdoutWrite).toHaveBeenCalledWith(
expect.stringContaining("QA runtime parity verdict: fail"),
expect.stringContaining("QA runtime parity verdict: pass"),
);
} finally {
process.exitCode = priorExitCode;

View File

@@ -2,6 +2,7 @@
import { describe, expect, it } from "vitest";
import {
__testing,
isRuntimeParityResultPass,
runRuntimeParityScenario,
type RuntimeId,
type RuntimeParityCell,
@@ -95,18 +96,22 @@ describe("runtime parity", () => {
scenarioId: "matching-tool-errors",
runCell: async (runtime) => ({
scenarioStatus: "pass",
cell: makeRuntimeParityCell(runtime, [
{
tool: "web_search",
argsHash: "same-args",
resultHash: runtime === "openclaw" ? "validation-error" : "provider-error",
errorClass: "tool-result-error",
},
]),
cell: {
...makeRuntimeParityCell(runtime, [
{
tool: "web_search",
argsHash: "same-args",
resultHash: runtime === "openclaw" ? "validation-error" : "provider-error",
errorClass: "tool-result-error",
},
]),
...(runtime === "codex" ? { runtimeErrorClass: "tool-error" } : {}),
},
}),
});
expect(result.drift).toBe("none");
expect(isRuntimeParityResultPass(result)).toBe(true);
});
it("prefers transcript tool results when mock debug rows are incomplete", () => {

View File

@@ -81,8 +81,8 @@ export function runtimeParityCellStatus(
export function isRuntimeParityResultPass(result: RuntimeParityResult) {
return (
result.drift !== "failure-mode" &&
runtimeParityCellStatus(result.cells.openclaw) === "pass" &&
runtimeParityCellStatus(result.cells.codex) === "pass"
isRuntimeParityCellPassable(result.cells.openclaw) &&
isRuntimeParityCellPassable(result.cells.codex)
);
}
@@ -771,6 +771,13 @@ function isHardFailureRuntimeError(errorClass: string | undefined) {
);
}
/**
 * Reports whether a single runtime cell may still count toward a passing
 * parity result.
 *
 * A cell is rejected when it is missing, when it carries a transport error
 * class, or when its runtime error class is classified as a hard failure by
 * `isHardFailureRuntimeError`. Otherwise it is accepted only if it has no
 * runtime error class at all or that class is exactly "tool-error".
 *
 * @param cell - The runtime cell to inspect, or `undefined` when absent.
 * @returns `true` when the cell is acceptable, `false` otherwise.
 */
function isRuntimeParityCellPassable(cell: RuntimeParityCell | undefined) {
  // A missing cell can never pass.
  if (!cell) {
    return false;
  }
  // Any transport error class disqualifies the cell outright.
  if (cell.transportErrorClass) {
    return false;
  }
  const runtimeError = cell.runtimeErrorClass;
  if (isHardFailureRuntimeError(runtimeError)) {
    return false;
  }
  // Remaining cases: "tool-error" is tolerated, as is the absence of any error class.
  return runtimeError === "tool-error" || !runtimeError;
}
/**
 * Reports whether any tool call in the list is tagged with the
 * `TOOL_RESULT_MISSING_ERROR_CLASS` error class.
 *
 * @param toolCalls - The tool calls to scan; the array is not modified.
 * @returns `true` if at least one call has that error class, `false` otherwise
 *   (including for an empty list).
 */
function hasMissingToolResult(toolCalls: readonly RuntimeParityToolCall[]) {
  const isMissingResult = (call: RuntimeParityToolCall) =>
    call.errorClass === TOOL_RESULT_MISSING_ERROR_CLASS;
  return toolCalls.some(isMissingResult);
}
@@ -869,8 +876,8 @@ function classifyRuntimeParityCells(params: {
if (
params.openclawScenarioStatus === "fail" ||
params.codexScenarioStatus === "fail" ||
params.openclaw.runtimeErrorClass ||
params.codex.runtimeErrorClass
!isRuntimeParityCellPassable(params.openclaw) ||
!isRuntimeParityCellPassable(params.codex)
) {
return {
drift: "failure-mode",

View File

@@ -125,10 +125,12 @@ describe("qa scenario catalog", () => {
const messageTool = readQaScenarioById("runtime-tool-message-tool");
const tavilySearch = readQaScenarioById("runtime-tool-tavily-search");
const webSearch = readQaScenarioById("runtime-tool-web-search");
const imageGenerate = readQaScenarioById("runtime-tool-image-generate");
expect(applyPatch.runtimeParityTier).toBe("standard");
expect(messageTool.runtimeParityTier).toBe("optional");
expect(tavilySearch.runtimeParityTier).toBe("optional");
expect(imageGenerate.runtimeParityTier).toBe("optional");
expect(readQaScenarioExecutionConfig(applyPatch.id)).toMatchObject({
toolName: "apply_patch",
toolCoverage: {
@@ -155,6 +157,15 @@ describe("qa scenario catalog", () => {
},
});
expect(readQaScenarioExecutionConfig(webSearch.id)).not.toHaveProperty("knownHarnessGap");
expect(readQaScenarioExecutionConfig(imageGenerate.id)).toMatchObject({
toolName: "image_generate",
toolCoverage: {
bucket: "openclaw-dynamic-integration",
expectedLayer: "openclaw-dynamic",
capabilityLayer: "openclaw-dynamic-direct",
required: true,
},
});
});
it("loads the Codex legacy Read vocabulary live parity canary", () => {

View File

@@ -4,7 +4,7 @@
id: runtime-tool-image-generate
title: Runtime tool fixture — image_generate
surface: runtime-tools
runtimeParityTier: standard
runtimeParityTier: optional
coverage:
primary:
- tools.image-generate
@@ -13,7 +13,7 @@ successCriteria:
- Effective tools expose image_generate after QA image-generation config is applied.
- The mock provider plans exactly one happy-path image_generate call.
- The mock provider plans one denied-input failure-path image_generate call.
- Runtime parity coverage hard-fails call/result drift in the standard direct-loading gate.
- Runtime parity coverage records async image start/result drift outside the standard direct-loading gate.
docsRefs:
- docs/tools/image-generation.md
codeRefs:
@@ -34,8 +34,8 @@ execution:
required: true
codexDefaultImpact: P4
qaImpact: P1
action: hard gate in the standard direct-loading tier
reason: image_generate is an OpenClaw integration tool and must stay visible and callable under OpenClaw and Codex direct runtime parity.
action: optional runtime parity gate with async image completion coverage
reason: image_generate is an OpenClaw integration tool whose happy path yields for async completion, so standard direct call/result parity would compare different lifecycle phases.
promptSnippet: "target=image_generate"
failurePromptSnippet: "failure target=image_generate"
```