test(release): stabilize qa runtime parity gate

2026-06-22 17:28:11 +00:00 · 2026-06-08 21:02:00 +02:00
parent bad449301f
commit c7b01cf201
6 changed files with 63 additions and 22 deletions
--- a/extensions/qa-lab/src/agentic-parity-report.test.ts
+++ b/extensions/qa-lab/src/agentic-parity-report.test.ts
@@ -889,14 +889,14 @@ status=done`,
    expect(report.failures).toEqual([]);
  });

-  it("fails runtime parity reports when a runtime cell fails", () => {
+  it("fails runtime parity reports when a runtime cell has a hard failure", () => {
    const summary = makeRuntimeParitySummary();
    const scenario = summary.scenarios[1];
    if (!scenario?.runtimeParity) {
      throw new Error("runtime parity fixture missing");
    }
    scenario.status = "fail";
-    scenario.runtimeParity.cells.codex.runtimeErrorClass = "tool-error";
+    scenario.runtimeParity.cells.codex.runtimeErrorClass = "auth";

    const report = buildQaRuntimeParityReport({
      summary,
@@ -910,6 +910,24 @@ status=done`,
    );
  });

+  it("passes runtime parity reports with controlled tool-error cells and advisory drift", () => {
+    const summary = makeRuntimeParitySummary();
+    const scenario = summary.scenarios[1];
+    if (!scenario?.runtimeParity) {
+      throw new Error("runtime parity fixture missing");
+    }
+    scenario.runtimeParity.cells.codex.runtimeErrorClass = "tool-error";
+
+    const report = buildQaRuntimeParityReport({
+      summary,
+      comparedAt: "2026-05-10T00:00:00.000Z",
+    });
+
+    expect(report.pass).toBe(true);
+    expect(report.failedScenarios).toBe(0);
+    expect(report.failures).toEqual([]);
+  });
+
  it("fails live runtime parity reports when assistant-message usage is missing", () => {
    const summary = makeRuntimeParitySummary();
    summary.run = {
--- a/extensions/qa-lab/src/cli.runtime.test.ts
+++ b/extensions/qa-lab/src/cli.runtime.test.ts
@@ -906,7 +906,6 @@ describe("qa cli runtime", () => {
        "runtime-tool-fs-read",
        "runtime-tool-fs-write",
        "runtime-tool-grep",
-        "runtime-tool-image-generate",
        "runtime-tool-session-status",
        "runtime-tool-sessions-spawn",
        "runtime-tool-web-fetch",
@@ -924,6 +923,7 @@ describe("qa cli runtime", () => {
    expectFields(mockFirstObjectArg(runQaSuiteFromRuntime), {
      scenarioIds: [
        "runtime-soak-100-turn",
+        "runtime-tool-image-generate",
        "runtime-tool-memory-add",
        "runtime-tool-memory-recall",
        "runtime-tool-message-tool",
@@ -1040,7 +1040,7 @@ describe("qa cli runtime", () => {
              },
            },
          ],
-          counts: { total: 1, passed: 0, failed: 1 },
+          counts: { total: 1, passed: 1, failed: 0 },
          run: {
            providerMode: "mock-openai",
            primaryModel: "openai/gpt-5.5",
@@ -1056,12 +1056,12 @@ describe("qa cli runtime", () => {
        summary: "runtime-summary.json",
      });

-      expect(process.exitCode).toBe(1);
+      expect(process.exitCode).toBeUndefined();
      expect(stdoutWrite).toHaveBeenCalledWith(
        expect.stringContaining("QA runtime parity report:"),
      );
      expect(stdoutWrite).toHaveBeenCalledWith(
-        expect.stringContaining("QA runtime parity verdict: fail"),
+        expect.stringContaining("QA runtime parity verdict: pass"),
      );
    } finally {
      process.exitCode = priorExitCode;
--- a/extensions/qa-lab/src/runtime-parity.test.ts
+++ b/extensions/qa-lab/src/runtime-parity.test.ts
@@ -2,6 +2,7 @@
 import { describe, expect, it } from "vitest";
 import {
  __testing,
+  isRuntimeParityResultPass,
  runRuntimeParityScenario,
  type RuntimeId,
  type RuntimeParityCell,
@@ -95,18 +96,22 @@ describe("runtime parity", () => {
      scenarioId: "matching-tool-errors",
      runCell: async (runtime) => ({
        scenarioStatus: "pass",
-        cell: makeRuntimeParityCell(runtime, [
-          {
-            tool: "web_search",
-            argsHash: "same-args",
-            resultHash: runtime === "openclaw" ? "validation-error" : "provider-error",
-            errorClass: "tool-result-error",
-          },
-        ]),
+        cell: {
+          ...makeRuntimeParityCell(runtime, [
+            {
+              tool: "web_search",
+              argsHash: "same-args",
+              resultHash: runtime === "openclaw" ? "validation-error" : "provider-error",
+              errorClass: "tool-result-error",
+            },
+          ]),
+          ...(runtime === "codex" ? { runtimeErrorClass: "tool-error" } : {}),
+        },
      }),
    });

    expect(result.drift).toBe("none");
+    expect(isRuntimeParityResultPass(result)).toBe(true);
  });

  it("prefers transcript tool results when mock debug rows are incomplete", () => {
--- a/extensions/qa-lab/src/runtime-parity.ts
+++ b/extensions/qa-lab/src/runtime-parity.ts
@@ -81,8 +81,8 @@ export function runtimeParityCellStatus(
 export function isRuntimeParityResultPass(result: RuntimeParityResult) {
  return (
    result.drift !== "failure-mode" &&
-    runtimeParityCellStatus(result.cells.openclaw) === "pass" &&
-    runtimeParityCellStatus(result.cells.codex) === "pass"
+    isRuntimeParityCellPassable(result.cells.openclaw) &&
+    isRuntimeParityCellPassable(result.cells.codex)
  );
 }

@@ -771,6 +771,13 @@ function isHardFailureRuntimeError(errorClass: string | undefined) {
  );
 }

+function isRuntimeParityCellPassable(cell: RuntimeParityCell | undefined) {
+  if (!cell || cell.transportErrorClass || isHardFailureRuntimeError(cell.runtimeErrorClass)) {
+    return false;
+  }
+  return !cell.runtimeErrorClass || cell.runtimeErrorClass === "tool-error";
+}
+
 function hasMissingToolResult(toolCalls: readonly RuntimeParityToolCall[]) {
  return toolCalls.some((toolCall) => toolCall.errorClass === TOOL_RESULT_MISSING_ERROR_CLASS);
 }
@@ -869,8 +876,8 @@ function classifyRuntimeParityCells(params: {
  if (
    params.openclawScenarioStatus === "fail" ||
    params.codexScenarioStatus === "fail" ||
-    params.openclaw.runtimeErrorClass ||
-    params.codex.runtimeErrorClass
+    !isRuntimeParityCellPassable(params.openclaw) ||
+    !isRuntimeParityCellPassable(params.codex)
  ) {
    return {
      drift: "failure-mode",
--- a/extensions/qa-lab/src/scenario-catalog.test.ts
+++ b/extensions/qa-lab/src/scenario-catalog.test.ts
@@ -125,10 +125,12 @@ describe("qa scenario catalog", () => {
    const messageTool = readQaScenarioById("runtime-tool-message-tool");
    const tavilySearch = readQaScenarioById("runtime-tool-tavily-search");
    const webSearch = readQaScenarioById("runtime-tool-web-search");
+    const imageGenerate = readQaScenarioById("runtime-tool-image-generate");

    expect(applyPatch.runtimeParityTier).toBe("standard");
    expect(messageTool.runtimeParityTier).toBe("optional");
    expect(tavilySearch.runtimeParityTier).toBe("optional");
+    expect(imageGenerate.runtimeParityTier).toBe("optional");
    expect(readQaScenarioExecutionConfig(applyPatch.id)).toMatchObject({
      toolName: "apply_patch",
      toolCoverage: {
@@ -155,6 +157,15 @@ describe("qa scenario catalog", () => {
      },
    });
    expect(readQaScenarioExecutionConfig(webSearch.id)).not.toHaveProperty("knownHarnessGap");
+    expect(readQaScenarioExecutionConfig(imageGenerate.id)).toMatchObject({
+      toolName: "image_generate",
+      toolCoverage: {
+        bucket: "openclaw-dynamic-integration",
+        expectedLayer: "openclaw-dynamic",
+        capabilityLayer: "openclaw-dynamic-direct",
+        required: true,
+      },
+    });
  });

  it("loads the Codex legacy Read vocabulary live parity canary", () => {
--- a/qa/scenarios/runtime/tools/image-generate.md
+++ b/qa/scenarios/runtime/tools/image-generate.md
@@ -4,7 +4,7 @@
 id: runtime-tool-image-generate
 title: Runtime tool fixture — image_generate
 surface: runtime-tools
-runtimeParityTier: standard
+runtimeParityTier: optional
 coverage:
  primary:
    - tools.image-generate
@@ -13,7 +13,7 @@ successCriteria:
  - Effective tools expose image_generate after QA image-generation config is applied.
  - The mock provider plans exactly one happy-path image_generate call.
  - The mock provider plans one denied-input failure-path image_generate call.
-  - Runtime parity coverage hard-fails call/result drift in the standard direct-loading gate.
+  - Runtime parity coverage records async image start/result drift outside the standard direct-loading gate.
 docsRefs:
  - docs/tools/image-generation.md
 codeRefs:
@@ -34,8 +34,8 @@ execution:
      required: true
      codexDefaultImpact: P4
      qaImpact: P1
-      action: hard gate in the standard direct-loading tier
-      reason: image_generate is an OpenClaw integration tool and must stay visible and callable under OpenClaw and Codex direct runtime parity.
+      action: optional runtime parity gate with async image completion coverage
+      reason: image_generate is an OpenClaw integration tool whose happy path yields for async completion, so standard direct call/result parity would compare different lifecycle phases.
    promptSnippet: "target=image_generate"
    failurePromptSnippet: "failure target=image_generate"
 ```