From c7b01cf201f736a3be7e98256d13db9775aec548 Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Mon, 8 Jun 2026 21:02:00 +0200 Subject: [PATCH] test(release): stabilize qa runtime parity gate --- .../qa-lab/src/agentic-parity-report.test.ts | 22 +++++++++++++++++-- extensions/qa-lab/src/cli.runtime.test.ts | 8 +++---- extensions/qa-lab/src/runtime-parity.test.ts | 21 +++++++++++------- extensions/qa-lab/src/runtime-parity.ts | 15 +++++++++---- .../qa-lab/src/scenario-catalog.test.ts | 11 ++++++++++ qa/scenarios/runtime/tools/image-generate.md | 8 +++---- 6 files changed, 63 insertions(+), 22 deletions(-) diff --git a/extensions/qa-lab/src/agentic-parity-report.test.ts b/extensions/qa-lab/src/agentic-parity-report.test.ts index ee222a7a938..e8d660256bc 100644 --- a/extensions/qa-lab/src/agentic-parity-report.test.ts +++ b/extensions/qa-lab/src/agentic-parity-report.test.ts @@ -889,14 +889,14 @@ status=done`, expect(report.failures).toEqual([]); }); - it("fails runtime parity reports when a runtime cell fails", () => { + it("fails runtime parity reports when a runtime cell has a hard failure", () => { const summary = makeRuntimeParitySummary(); const scenario = summary.scenarios[1]; if (!scenario?.runtimeParity) { throw new Error("runtime parity fixture missing"); } scenario.status = "fail"; - scenario.runtimeParity.cells.codex.runtimeErrorClass = "tool-error"; + scenario.runtimeParity.cells.codex.runtimeErrorClass = "auth"; const report = buildQaRuntimeParityReport({ summary, @@ -910,6 +910,24 @@ status=done`, ); }); + it("passes runtime parity reports with controlled tool-error cells and advisory drift", () => { + const summary = makeRuntimeParitySummary(); + const scenario = summary.scenarios[1]; + if (!scenario?.runtimeParity) { + throw new Error("runtime parity fixture missing"); + } + scenario.runtimeParity.cells.codex.runtimeErrorClass = "tool-error"; + + const report = buildQaRuntimeParityReport({ + summary, + comparedAt: "2026-05-10T00:00:00.000Z", + }); + + expect(report.pass).toBe(true); + expect(report.failedScenarios).toBe(0); + expect(report.failures).toEqual([]); + }); + it("fails live runtime parity reports when assistant-message usage is missing", () => { const summary = makeRuntimeParitySummary(); summary.run = { diff --git a/extensions/qa-lab/src/cli.runtime.test.ts b/extensions/qa-lab/src/cli.runtime.test.ts index d8f072334f4..6f5a68e8e2f 100644 --- a/extensions/qa-lab/src/cli.runtime.test.ts +++ b/extensions/qa-lab/src/cli.runtime.test.ts @@ -906,7 +906,6 @@ describe("qa cli runtime", () => { "runtime-tool-fs-read", "runtime-tool-fs-write", "runtime-tool-grep", - "runtime-tool-image-generate", "runtime-tool-session-status", "runtime-tool-sessions-spawn", "runtime-tool-web-fetch", @@ -924,6 +923,7 @@ describe("qa cli runtime", () => { expectFields(mockFirstObjectArg(runQaSuiteFromRuntime), { scenarioIds: [ "runtime-soak-100-turn", + "runtime-tool-image-generate", "runtime-tool-memory-add", "runtime-tool-memory-recall", "runtime-tool-message-tool", @@ -1040,7 +1040,7 @@ describe("qa cli runtime", () => { }, }, ], - counts: { total: 1, passed: 0, failed: 1 }, + counts: { total: 1, passed: 1, failed: 0 }, run: { providerMode: "mock-openai", primaryModel: "openai/gpt-5.5", @@ -1056,12 +1056,12 @@ describe("qa cli runtime", () => { summary: "runtime-summary.json", }); - expect(process.exitCode).toBe(1); + expect(process.exitCode).toBeUndefined(); expect(stdoutWrite).toHaveBeenCalledWith( expect.stringContaining("QA runtime parity report:"), ); expect(stdoutWrite).toHaveBeenCalledWith( - expect.stringContaining("QA runtime parity verdict: fail"), + expect.stringContaining("QA runtime parity verdict: pass"), ); } finally { process.exitCode = priorExitCode; diff --git a/extensions/qa-lab/src/runtime-parity.test.ts b/extensions/qa-lab/src/runtime-parity.test.ts index fe89feb7c3b..3e100f7e641 100644 --- a/extensions/qa-lab/src/runtime-parity.test.ts +++ b/extensions/qa-lab/src/runtime-parity.test.ts @@ -2,6 +2,7 @@ import { describe, expect, it } from "vitest"; import { __testing, + isRuntimeParityResultPass, runRuntimeParityScenario, type RuntimeId, type RuntimeParityCell, @@ -95,18 +96,22 @@ describe("runtime parity", () => { scenarioId: "matching-tool-errors", runCell: async (runtime) => ({ scenarioStatus: "pass", - cell: makeRuntimeParityCell(runtime, [ - { - tool: "web_search", - argsHash: "same-args", - resultHash: runtime === "openclaw" ? "validation-error" : "provider-error", - errorClass: "tool-result-error", - }, - ]), + cell: { + ...makeRuntimeParityCell(runtime, [ + { + tool: "web_search", + argsHash: "same-args", + resultHash: runtime === "openclaw" ? "validation-error" : "provider-error", + errorClass: "tool-result-error", + }, + ]), + ...(runtime === "codex" ? { runtimeErrorClass: "tool-error" } : {}), + }, }), }); expect(result.drift).toBe("none"); + expect(isRuntimeParityResultPass(result)).toBe(true); }); it("prefers transcript tool results when mock debug rows are incomplete", () => { diff --git a/extensions/qa-lab/src/runtime-parity.ts b/extensions/qa-lab/src/runtime-parity.ts index 512390053dc..a6b515d52b1 100644 --- a/extensions/qa-lab/src/runtime-parity.ts +++ b/extensions/qa-lab/src/runtime-parity.ts @@ -81,8 +81,8 @@ export function runtimeParityCellStatus( export function isRuntimeParityResultPass(result: RuntimeParityResult) { return ( result.drift !== "failure-mode" && - runtimeParityCellStatus(result.cells.openclaw) === "pass" && - runtimeParityCellStatus(result.cells.codex) === "pass" + isRuntimeParityCellPassable(result.cells.openclaw) && + isRuntimeParityCellPassable(result.cells.codex) ); } @@ -771,6 +771,13 @@ function isHardFailureRuntimeError(errorClass: string | undefined) { ); } +function isRuntimeParityCellPassable(cell: RuntimeParityCell | undefined) { + if (!cell || cell.transportErrorClass || isHardFailureRuntimeError(cell.runtimeErrorClass)) { + return false; + } + return !cell.runtimeErrorClass || cell.runtimeErrorClass === "tool-error"; +} + function hasMissingToolResult(toolCalls: readonly RuntimeParityToolCall[]) { return toolCalls.some((toolCall) => toolCall.errorClass === TOOL_RESULT_MISSING_ERROR_CLASS); } @@ -869,8 +876,8 @@ function classifyRuntimeParityCells(params: { if ( params.openclawScenarioStatus === "fail" || params.codexScenarioStatus === "fail" || - params.openclaw.runtimeErrorClass || - params.codex.runtimeErrorClass + !isRuntimeParityCellPassable(params.openclaw) || + !isRuntimeParityCellPassable(params.codex) ) { return { drift: "failure-mode", diff --git a/extensions/qa-lab/src/scenario-catalog.test.ts b/extensions/qa-lab/src/scenario-catalog.test.ts index f806426d2c3..62aa1f505cb 100644 --- a/extensions/qa-lab/src/scenario-catalog.test.ts +++ b/extensions/qa-lab/src/scenario-catalog.test.ts @@ -125,10 +125,12 @@ describe("qa scenario catalog", () => { const messageTool = readQaScenarioById("runtime-tool-message-tool"); const tavilySearch = readQaScenarioById("runtime-tool-tavily-search"); const webSearch = readQaScenarioById("runtime-tool-web-search"); + const imageGenerate = readQaScenarioById("runtime-tool-image-generate"); expect(applyPatch.runtimeParityTier).toBe("standard"); expect(messageTool.runtimeParityTier).toBe("optional"); expect(tavilySearch.runtimeParityTier).toBe("optional"); + expect(imageGenerate.runtimeParityTier).toBe("optional"); expect(readQaScenarioExecutionConfig(applyPatch.id)).toMatchObject({ toolName: "apply_patch", toolCoverage: { @@ -155,6 +157,15 @@ describe("qa scenario catalog", () => { }, }); expect(readQaScenarioExecutionConfig(webSearch.id)).not.toHaveProperty("knownHarnessGap"); + expect(readQaScenarioExecutionConfig(imageGenerate.id)).toMatchObject({ + toolName: "image_generate", + toolCoverage: { + bucket: "openclaw-dynamic-integration", + expectedLayer: "openclaw-dynamic", + capabilityLayer: "openclaw-dynamic-direct", + required: true, + }, + }); }); it("loads the Codex legacy Read vocabulary live parity canary", () => { diff --git a/qa/scenarios/runtime/tools/image-generate.md b/qa/scenarios/runtime/tools/image-generate.md index 19ccc3f162d..bf57cabaabf 100644 --- a/qa/scenarios/runtime/tools/image-generate.md +++ b/qa/scenarios/runtime/tools/image-generate.md @@ -4,7 +4,7 @@ id: runtime-tool-image-generate title: Runtime tool fixture — image_generate surface: runtime-tools -runtimeParityTier: standard +runtimeParityTier: optional coverage: primary: - tools.image-generate @@ -13,7 +13,7 @@ successCriteria: - Effective tools expose image_generate after QA image-generation config is applied. - The mock provider plans exactly one happy-path image_generate call. - The mock provider plans one denied-input failure-path image_generate call. - - Runtime parity coverage hard-fails call/result drift in the standard direct-loading gate. + - Runtime parity coverage records async image start/result drift outside the standard direct-loading gate. docsRefs: - docs/tools/image-generation.md codeRefs: @@ -34,8 +34,8 @@ execution: required: true codexDefaultImpact: P4 qaImpact: P1 - action: hard gate in the standard direct-loading tier - reason: image_generate is an OpenClaw integration tool and must stay visible and callable under OpenClaw and Codex direct runtime parity. + action: optional runtime parity gate with async image completion coverage + reason: image_generate is an OpenClaw integration tool whose happy path yields for async completion, so standard direct call/result parity would compare different lifecycle phases. promptSnippet: "target=image_generate" failurePromptSnippet: "failure target=image_generate" ```