mirror of
https://github.com/openclaw/openclaw.git
synced 2026-06-22 17:28:11 +00:00
test(release): stabilize qa runtime parity gate
This commit is contained in:
@@ -889,14 +889,14 @@ status=done`,
|
||||
expect(report.failures).toEqual([]);
|
||||
});
|
||||
|
||||
it("fails runtime parity reports when a runtime cell fails", () => {
|
||||
it("fails runtime parity reports when a runtime cell has a hard failure", () => {
|
||||
const summary = makeRuntimeParitySummary();
|
||||
const scenario = summary.scenarios[1];
|
||||
if (!scenario?.runtimeParity) {
|
||||
throw new Error("runtime parity fixture missing");
|
||||
}
|
||||
scenario.status = "fail";
|
||||
scenario.runtimeParity.cells.codex.runtimeErrorClass = "tool-error";
|
||||
scenario.runtimeParity.cells.codex.runtimeErrorClass = "auth";
|
||||
|
||||
const report = buildQaRuntimeParityReport({
|
||||
summary,
|
||||
@@ -910,6 +910,24 @@ status=done`,
|
||||
);
|
||||
});
|
||||
|
||||
it("passes runtime parity reports with controlled tool-error cells and advisory drift", () => {
|
||||
const summary = makeRuntimeParitySummary();
|
||||
const scenario = summary.scenarios[1];
|
||||
if (!scenario?.runtimeParity) {
|
||||
throw new Error("runtime parity fixture missing");
|
||||
}
|
||||
scenario.runtimeParity.cells.codex.runtimeErrorClass = "tool-error";
|
||||
|
||||
const report = buildQaRuntimeParityReport({
|
||||
summary,
|
||||
comparedAt: "2026-05-10T00:00:00.000Z",
|
||||
});
|
||||
|
||||
expect(report.pass).toBe(true);
|
||||
expect(report.failedScenarios).toBe(0);
|
||||
expect(report.failures).toEqual([]);
|
||||
});
|
||||
|
||||
it("fails live runtime parity reports when assistant-message usage is missing", () => {
|
||||
const summary = makeRuntimeParitySummary();
|
||||
summary.run = {
|
||||
|
||||
@@ -906,7 +906,6 @@ describe("qa cli runtime", () => {
|
||||
"runtime-tool-fs-read",
|
||||
"runtime-tool-fs-write",
|
||||
"runtime-tool-grep",
|
||||
"runtime-tool-image-generate",
|
||||
"runtime-tool-session-status",
|
||||
"runtime-tool-sessions-spawn",
|
||||
"runtime-tool-web-fetch",
|
||||
@@ -924,6 +923,7 @@ describe("qa cli runtime", () => {
|
||||
expectFields(mockFirstObjectArg(runQaSuiteFromRuntime), {
|
||||
scenarioIds: [
|
||||
"runtime-soak-100-turn",
|
||||
"runtime-tool-image-generate",
|
||||
"runtime-tool-memory-add",
|
||||
"runtime-tool-memory-recall",
|
||||
"runtime-tool-message-tool",
|
||||
@@ -1040,7 +1040,7 @@ describe("qa cli runtime", () => {
|
||||
},
|
||||
},
|
||||
],
|
||||
counts: { total: 1, passed: 0, failed: 1 },
|
||||
counts: { total: 1, passed: 1, failed: 0 },
|
||||
run: {
|
||||
providerMode: "mock-openai",
|
||||
primaryModel: "openai/gpt-5.5",
|
||||
@@ -1056,12 +1056,12 @@ describe("qa cli runtime", () => {
|
||||
summary: "runtime-summary.json",
|
||||
});
|
||||
|
||||
expect(process.exitCode).toBe(1);
|
||||
expect(process.exitCode).toBeUndefined();
|
||||
expect(stdoutWrite).toHaveBeenCalledWith(
|
||||
expect.stringContaining("QA runtime parity report:"),
|
||||
);
|
||||
expect(stdoutWrite).toHaveBeenCalledWith(
|
||||
expect.stringContaining("QA runtime parity verdict: fail"),
|
||||
expect.stringContaining("QA runtime parity verdict: pass"),
|
||||
);
|
||||
} finally {
|
||||
process.exitCode = priorExitCode;
|
||||
|
||||
@@ -2,6 +2,7 @@
|
||||
import { describe, expect, it } from "vitest";
|
||||
import {
|
||||
__testing,
|
||||
isRuntimeParityResultPass,
|
||||
runRuntimeParityScenario,
|
||||
type RuntimeId,
|
||||
type RuntimeParityCell,
|
||||
@@ -95,18 +96,22 @@ describe("runtime parity", () => {
|
||||
scenarioId: "matching-tool-errors",
|
||||
runCell: async (runtime) => ({
|
||||
scenarioStatus: "pass",
|
||||
cell: makeRuntimeParityCell(runtime, [
|
||||
{
|
||||
tool: "web_search",
|
||||
argsHash: "same-args",
|
||||
resultHash: runtime === "openclaw" ? "validation-error" : "provider-error",
|
||||
errorClass: "tool-result-error",
|
||||
},
|
||||
]),
|
||||
cell: {
|
||||
...makeRuntimeParityCell(runtime, [
|
||||
{
|
||||
tool: "web_search",
|
||||
argsHash: "same-args",
|
||||
resultHash: runtime === "openclaw" ? "validation-error" : "provider-error",
|
||||
errorClass: "tool-result-error",
|
||||
},
|
||||
]),
|
||||
...(runtime === "codex" ? { runtimeErrorClass: "tool-error" } : {}),
|
||||
},
|
||||
}),
|
||||
});
|
||||
|
||||
expect(result.drift).toBe("none");
|
||||
expect(isRuntimeParityResultPass(result)).toBe(true);
|
||||
});
|
||||
|
||||
it("prefers transcript tool results when mock debug rows are incomplete", () => {
|
||||
|
||||
@@ -81,8 +81,8 @@ export function runtimeParityCellStatus(
|
||||
export function isRuntimeParityResultPass(result: RuntimeParityResult) {
|
||||
return (
|
||||
result.drift !== "failure-mode" &&
|
||||
runtimeParityCellStatus(result.cells.openclaw) === "pass" &&
|
||||
runtimeParityCellStatus(result.cells.codex) === "pass"
|
||||
isRuntimeParityCellPassable(result.cells.openclaw) &&
|
||||
isRuntimeParityCellPassable(result.cells.codex)
|
||||
);
|
||||
}
|
||||
|
||||
@@ -771,6 +771,13 @@ function isHardFailureRuntimeError(errorClass: string | undefined) {
|
||||
);
|
||||
}
|
||||
|
||||
function isRuntimeParityCellPassable(cell: RuntimeParityCell | undefined) {
|
||||
if (!cell || cell.transportErrorClass || isHardFailureRuntimeError(cell.runtimeErrorClass)) {
|
||||
return false;
|
||||
}
|
||||
return !cell.runtimeErrorClass || cell.runtimeErrorClass === "tool-error";
|
||||
}
|
||||
|
||||
function hasMissingToolResult(toolCalls: readonly RuntimeParityToolCall[]) {
|
||||
return toolCalls.some((toolCall) => toolCall.errorClass === TOOL_RESULT_MISSING_ERROR_CLASS);
|
||||
}
|
||||
@@ -869,8 +876,8 @@ function classifyRuntimeParityCells(params: {
|
||||
if (
|
||||
params.openclawScenarioStatus === "fail" ||
|
||||
params.codexScenarioStatus === "fail" ||
|
||||
params.openclaw.runtimeErrorClass ||
|
||||
params.codex.runtimeErrorClass
|
||||
!isRuntimeParityCellPassable(params.openclaw) ||
|
||||
!isRuntimeParityCellPassable(params.codex)
|
||||
) {
|
||||
return {
|
||||
drift: "failure-mode",
|
||||
|
||||
@@ -125,10 +125,12 @@ describe("qa scenario catalog", () => {
|
||||
const messageTool = readQaScenarioById("runtime-tool-message-tool");
|
||||
const tavilySearch = readQaScenarioById("runtime-tool-tavily-search");
|
||||
const webSearch = readQaScenarioById("runtime-tool-web-search");
|
||||
const imageGenerate = readQaScenarioById("runtime-tool-image-generate");
|
||||
|
||||
expect(applyPatch.runtimeParityTier).toBe("standard");
|
||||
expect(messageTool.runtimeParityTier).toBe("optional");
|
||||
expect(tavilySearch.runtimeParityTier).toBe("optional");
|
||||
expect(imageGenerate.runtimeParityTier).toBe("optional");
|
||||
expect(readQaScenarioExecutionConfig(applyPatch.id)).toMatchObject({
|
||||
toolName: "apply_patch",
|
||||
toolCoverage: {
|
||||
@@ -155,6 +157,15 @@ describe("qa scenario catalog", () => {
|
||||
},
|
||||
});
|
||||
expect(readQaScenarioExecutionConfig(webSearch.id)).not.toHaveProperty("knownHarnessGap");
|
||||
expect(readQaScenarioExecutionConfig(imageGenerate.id)).toMatchObject({
|
||||
toolName: "image_generate",
|
||||
toolCoverage: {
|
||||
bucket: "openclaw-dynamic-integration",
|
||||
expectedLayer: "openclaw-dynamic",
|
||||
capabilityLayer: "openclaw-dynamic-direct",
|
||||
required: true,
|
||||
},
|
||||
});
|
||||
});
|
||||
|
||||
it("loads the Codex legacy Read vocabulary live parity canary", () => {
|
||||
|
||||
@@ -4,7 +4,7 @@
|
||||
id: runtime-tool-image-generate
|
||||
title: Runtime tool fixture — image_generate
|
||||
surface: runtime-tools
|
||||
runtimeParityTier: standard
|
||||
runtimeParityTier: optional
|
||||
coverage:
|
||||
primary:
|
||||
- tools.image-generate
|
||||
@@ -13,7 +13,7 @@ successCriteria:
|
||||
- Effective tools expose image_generate after QA image-generation config is applied.
|
||||
- The mock provider plans exactly one happy-path image_generate call.
|
||||
- The mock provider plans one denied-input failure-path image_generate call.
|
||||
- Runtime parity coverage hard-fails call/result drift in the standard direct-loading gate.
|
||||
- Runtime parity coverage records async image start/result drift outside the standard direct-loading gate.
|
||||
docsRefs:
|
||||
- docs/tools/image-generation.md
|
||||
codeRefs:
|
||||
@@ -34,8 +34,8 @@ execution:
|
||||
required: true
|
||||
codexDefaultImpact: P4
|
||||
qaImpact: P1
|
||||
action: hard gate in the standard direct-loading tier
|
||||
reason: image_generate is an OpenClaw integration tool and must stay visible and callable under OpenClaw and Codex direct runtime parity.
|
||||
action: optional runtime parity gate with async image completion coverage
|
||||
reason: image_generate is an OpenClaw integration tool whose happy path yields for async completion, so standard direct call/result parity would compare different lifecycle phases.
|
||||
promptSnippet: "target=image_generate"
|
||||
failurePromptSnippet: "failure target=image_generate"
|
||||
```
|
||||
|
||||
Reference in New Issue
Block a user