diff --git a/.github/workflows/openclaw-release-checks.yml b/.github/workflows/openclaw-release-checks.yml index cd6df53f093..27dc035ac16 100644 --- a/.github/workflows/openclaw-release-checks.yml +++ b/.github/workflows/openclaw-release-checks.yml @@ -955,6 +955,57 @@ jobs: retention-days: 14 if-no-files-found: warn + runtime_tool_coverage_release_checks: + name: Enforce QA Lab runtime tool coverage + needs: [resolve_target, qa_lab_runtime_parity_release_checks] + if: always() && contains(fromJSON('["all","qa","qa-parity"]'), needs.resolve_target.outputs.rerun_group) + runs-on: ubuntu-24.04 + timeout-minutes: 15 + permissions: + contents: read + actions: read + env: + OPENCLAW_BUILD_PRIVATE_QA: "1" + OPENCLAW_ENABLE_PRIVATE_QA_CLI: "1" + steps: + - name: Checkout selected ref + uses: actions/checkout@v6 + with: + persist-credentials: false + ref: ${{ needs.resolve_target.outputs.revision }} + fetch-depth: 1 + + - name: Setup Node environment + uses: ./.github/actions/setup-node-env + with: + node-version: ${{ env.NODE_VERSION }} + pnpm-version: ${{ env.PNPM_VERSION }} + install-bun: "true" + + - name: Download runtime parity artifacts + uses: actions/download-artifact@v4 + with: + name: release-qa-runtime-parity-${{ needs.resolve_target.outputs.revision }} + path: .artifacts/qa-e2e/ + + - name: Enforce standard runtime tool coverage + run: | + set -euo pipefail + pnpm openclaw qa coverage \ + --repo-root . 
\ + --tools \ + --summary .artifacts/qa-e2e/runtime-parity-standard/qa-suite-summary.json \ + --output .artifacts/qa-e2e/runtime-parity-standard-report/qa-runtime-tool-coverage-report.md + + - name: Upload runtime tool coverage artifacts + if: always() + uses: actions/upload-artifact@v4 + with: + name: release-qa-runtime-tool-coverage-${{ needs.resolve_target.outputs.revision }} + path: .artifacts/qa-e2e/runtime-parity-standard-report/ + retention-days: 14 + if-no-files-found: warn + qa_live_matrix_release_checks: name: Run QA Lab live Matrix lane needs: [resolve_target] @@ -1434,6 +1485,7 @@ jobs: - qa_lab_parity_lane_release_checks - qa_lab_parity_report_release_checks - qa_lab_runtime_parity_release_checks + - runtime_tool_coverage_release_checks - qa_live_matrix_release_checks - qa_live_telegram_release_checks - qa_live_discord_release_checks @@ -1465,6 +1517,7 @@ jobs: "qa_lab_parity_lane_release_checks=${{ needs.qa_lab_parity_lane_release_checks.result }}" \ "qa_lab_parity_report_release_checks=${{ needs.qa_lab_parity_report_release_checks.result }}" \ "qa_lab_runtime_parity_release_checks=${{ needs.qa_lab_runtime_parity_release_checks.result }}" \ + "runtime_tool_coverage_release_checks=${{ needs.runtime_tool_coverage_release_checks.result }}" \ "qa_live_matrix_release_checks=${{ needs.qa_live_matrix_release_checks.result }}" \ "qa_live_telegram_release_checks=${{ needs.qa_live_telegram_release_checks.result }}" \ "qa_live_discord_release_checks=${{ needs.qa_live_discord_release_checks.result }}" \ diff --git a/CHANGELOG.md b/CHANGELOG.md index 47e2095ab07..e91b7750c20 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -24,6 +24,7 @@ Docs: https://docs.openclaw.ai - QA-Lab: add live-only harness self-health scenarios for plugin hook crashes, manifest contract errors, and WebChat direct-reply self-message routing. (#80323) Thanks @100yenadmin. 
- QA-Lab: add runtime tool fixture scenarios and coverage reporting for Codex-native workspace tools, OpenClaw dynamic tools, and optional plugin-backed tools. Fixes #80173. Thanks @100yenadmin. - QA-Lab: expose runtime tool fixture coverage through `openclaw qa coverage --tools`, with optional suite-summary evaluation for parity gate artifacts. Thanks @100yenadmin. +- QA-Lab: hard-gate required OpenClaw dynamic runtime-tool drift in the standard Codex-vs-Pi tier with a blocking release-check verifier and publish the tool coverage report artifact. Fixes #80339; refs #80319. Thanks @100yenadmin. - QA-Lab: add the personal-agent approval-denial scenario so the benchmark pack verifies denied local reads stop cleanly without tool progress or fixture leaks. (#83150) Thanks @iFiras-Max1. ### Fixes diff --git a/docs/ci.md b/docs/ci.md index f6fd1923822..821abfe12a4 100644 --- a/docs/ci.md +++ b/docs/ci.md @@ -203,7 +203,7 @@ Docker release-path soak; `full` forces soak on. The umbrella records the dispatched child run ids, and the final `Verify full validation` job re-checks current child run conclusions and appends slowest-job tables for each child run. If a child workflow is rerun and turns green, rerun only the parent verifier job to refresh the umbrella result and timing summary. -For recovery, both `Full Release Validation` and `OpenClaw Release Checks` accept `rerun_group`. Use `all` for a release candidate, `ci` for only the normal full CI child, `plugin-prerelease` for only the plugin prerelease child, `release-checks` for every release child, or a narrower group: `install-smoke`, `cross-os`, `live-e2e`, `package`, `qa`, `qa-parity`, `qa-live`, or `npm-telegram` on the umbrella. This keeps a failed release box rerun bounded after a focused fix. 
For one failed cross-OS lane, combine `rerun_group=cross-os` with `cross_os_suite_filter`, for example `windows/packaged-upgrade`; long cross-OS commands emit heartbeat lines and packaged-upgrade summaries include per-phase timings. QA release-check lanes are advisory, so QA-only failures warn but do not block the release-check verifier. +For recovery, both `Full Release Validation` and `OpenClaw Release Checks` accept `rerun_group`. Use `all` for a release candidate, `ci` for only the normal full CI child, `plugin-prerelease` for only the plugin prerelease child, `release-checks` for every release child, or a narrower group: `install-smoke`, `cross-os`, `live-e2e`, `package`, `qa`, `qa-parity`, `qa-live`, or `npm-telegram` on the umbrella. This keeps a failed release box rerun bounded after a focused fix. For one failed cross-OS lane, combine `rerun_group=cross-os` with `cross_os_suite_filter`, for example `windows/packaged-upgrade`; long cross-OS commands emit heartbeat lines and packaged-upgrade summaries include per-phase timings. QA release-check lanes are advisory except the standard runtime tool coverage gate, which blocks the release-check verifier when required OpenClaw dynamic tools drift or disappear from the standard tier summary. `OpenClaw Release Checks` uses the trusted workflow ref to resolve the selected ref once into a `release-package-under-test` tarball, then passes that artifact to cross-OS checks and Package Acceptance, plus the live/E2E release-path Docker workflow when soak coverage runs. That keeps the package bytes consistent across release boxes and avoids repacking the same candidate in multiple child jobs. 
diff --git a/docs/reference/RELEASING.md b/docs/reference/RELEASING.md index 4e905bf3a38..bc036ee34d1 100644 --- a/docs/reference/RELEASING.md +++ b/docs/reference/RELEASING.md @@ -442,8 +442,10 @@ Focused `npm-telegram` reruns require `release_package_spec` or `npm_telegram_package_spec`; full/all runs with `release_profile=full` use the release-checks package artifact. Focused cross-OS reruns can add `cross_os_suite_filter=windows/packaged-upgrade` or -another OS/suite filter. QA release-check failures are advisory; a QA-only -failure does not block release validation. +another OS/suite filter. QA release-check failures are advisory except the +standard runtime tool coverage gate, which blocks release validation when +required OpenClaw dynamic tools drift or disappear from the standard tier +summary. ### Vitest diff --git a/docs/reference/full-release-validation.md b/docs/reference/full-release-validation.md index a764b1aec81..af95539b506 100644 --- a/docs/reference/full-release-validation.md +++ b/docs/reference/full-release-validation.md @@ -166,9 +166,10 @@ summaries include per-phase timings for packaged upgrade lanes, and long-running commands print heartbeat lines so a stuck Windows update is visible before the job timeout. -QA release-check lanes are advisory. A QA-only failure is reported as a warning -and does not block the release-check verifier; rerun `rerun_group=qa`, -`qa-parity`, or `qa-live` when you need fresh QA evidence. +QA release-check lanes are advisory except the standard runtime tool coverage +gate. Required OpenClaw dynamic tool drift in the standard tier blocks the +release-check verifier; other QA-only failures are reported as warnings. Rerun +`rerun_group=qa`, `qa-parity`, or `qa-live` when you need fresh QA evidence. 
## Evidence to keep diff --git a/extensions/qa-lab/src/cli.runtime.test.ts b/extensions/qa-lab/src/cli.runtime.test.ts index 4cb46cad667..cfe49dcc25e 100644 --- a/extensions/qa-lab/src/cli.runtime.test.ts +++ b/extensions/qa-lab/src/cli.runtime.test.ts @@ -979,6 +979,64 @@ describe("qa cli runtime", () => { expectWriteContains(stdoutWrite, "codex-native-workspace"); }); + it("exits nonzero when tool coverage summary has required drift", async () => { + const priorExitCode = process.exitCode; + const repoRoot = await fs.mkdtemp(path.join(os.tmpdir(), "qa-tool-coverage-")); + try { + await fs.writeFile( + path.join(repoRoot, "runtime-summary.json"), + JSON.stringify({ + scenarios: [ + { + name: "runtime-tool-web-search", + status: "fail", + runtimeParity: { + scenarioId: "runtime-tool-web-search", + drift: "tool-call-shape", + driftDetails: "Codex emitted no web_search call", + cells: { + pi: { + runtime: "pi", + transcriptBytes: "", + toolCalls: [{ tool: "web_search", argsHash: "a", resultHash: "r" }], + finalText: "", + usage: { inputTokens: 0, outputTokens: 0, totalTokens: 0 }, + wallClockMs: 1, + bootStateLines: [], + }, + codex: { + runtime: "codex", + transcriptBytes: "", + toolCalls: [], + finalText: "", + usage: { inputTokens: 0, outputTokens: 0, totalTokens: 0 }, + wallClockMs: 1, + bootStateLines: [], + }, + }, + }, + }, + ], + run: { runtimePair: ["pi", "codex"] }, + }), + "utf8", + ); + + await runQaCoverageReportCommand({ + repoRoot, + tools: true, + summary: "runtime-summary.json", + }); + + expect(process.exitCode).toBe(1); + expectWriteContains(stdoutWrite, "- Verdict: fail"); + expectWriteContains(stdoutWrite, "web-search drift=tool-call-shape"); + } finally { + process.exitCode = priorExitCode; + await fs.rm(repoRoot, { recursive: true, force: true }); + } + }); + it("resolves character eval paths and passes model refs through", async () => { await runQaCharacterEvalCommand({ repoRoot: "/tmp/openclaw-repo", diff --git 
a/extensions/qa-lab/src/cli.runtime.ts b/extensions/qa-lab/src/cli.runtime.ts index 428f3c959f3..a8c605383f7 100644 --- a/extensions/qa-lab/src/cli.runtime.ts +++ b/extensions/qa-lab/src/cli.runtime.ts @@ -769,6 +769,9 @@ export async function runQaCoverageReportCommand(opts: { ? `${JSON.stringify(report, null, 2)}\n` : renderQaToolCoverageMarkdownReport(report); outputLabel = "QA tool coverage report"; + if (summary && !report.pass) { + process.exitCode = 1; + } } else { if (opts.summary?.trim()) { throw new Error("--summary requires --tools."); diff --git a/extensions/qa-lab/src/scenario-catalog.test.ts b/extensions/qa-lab/src/scenario-catalog.test.ts index 5791f1139fe..52dc50cfb24 100644 --- a/extensions/qa-lab/src/scenario-catalog.test.ts +++ b/extensions/qa-lab/src/scenario-catalog.test.ts @@ -120,6 +120,7 @@ describe("qa scenario catalog", () => { const applyPatch = readQaScenarioById("runtime-tool-apply-patch"); const messageTool = readQaScenarioById("runtime-tool-message-tool"); const tavilySearch = readQaScenarioById("runtime-tool-tavily-search"); + const webSearch = readQaScenarioById("runtime-tool-web-search"); expect(applyPatch.runtimeParityTier).toBe("standard"); expect(messageTool.runtimeParityTier).toBe("optional"); @@ -140,6 +141,16 @@ describe("qa scenario catalog", () => { required: false, }, }); + expect(readQaScenarioExecutionConfig(webSearch.id)).toMatchObject({ + toolName: "web_search", + toolCoverage: { + bucket: "openclaw-dynamic-integration", + expectedLayer: "openclaw-dynamic", + capabilityLayer: "openclaw-dynamic-direct", + required: true, + }, + }); + expect(readQaScenarioExecutionConfig(webSearch.id)).not.toHaveProperty("knownHarnessGap"); }); it("loads the Codex Pi-shaped Read vocabulary live parity canary", () => { diff --git a/extensions/qa-lab/src/tool-coverage-report.test.ts b/extensions/qa-lab/src/tool-coverage-report.test.ts index e450e8080b3..8a2c77f8044 100644 --- a/extensions/qa-lab/src/tool-coverage-report.test.ts +++ 
b/extensions/qa-lab/src/tool-coverage-report.test.ts @@ -223,6 +223,83 @@ describe("qa tool coverage report", () => { ); }); + it("fails untracked required OpenClaw dynamic tool drift", () => { + const report = buildQaToolCoverageReport({ + scenarios: [ + makeScenario("tool-web-search", "web-search", { + toolName: "web_search", + toolCoverage: { + bucket: "openclaw-dynamic-integration", + expectedLayer: "openclaw-dynamic", + capabilityLayer: "openclaw-dynamic-direct", + required: true, + }, + }), + ], + summary: { + scenarios: [ + { + name: "tool web_search", + status: "fail", + runtimeParity: { + scenarioId: "tool-web-search", + drift: "tool-call-shape", + driftDetails: "Codex emitted no web_search call", + cells: { + pi: { + runtime: "pi", + transcriptBytes: "", + toolCalls: [{ tool: "web_search", argsHash: "a", resultHash: "r" }], + finalText: "", + usage: { inputTokens: 0, outputTokens: 0, totalTokens: 0 }, + wallClockMs: 1, + bootStateLines: [], + }, + codex: { + runtime: "codex", + transcriptBytes: "", + toolCalls: [], + finalText: "", + usage: { inputTokens: 0, outputTokens: 0, totalTokens: 0 }, + wallClockMs: 1, + bootStateLines: [], + }, + }, + }, + }, + ], + }, + generatedAt: "2026-05-10T00:00:00.000Z", + }); + + expect(report.pass).toBe(false); + expect(report.failures).toEqual([ + "web-search drift=tool-call-shape (Codex emitted no web_search call)", + ]); + }); + + it("fails untracked required tools missing from an evaluated summary", () => { + const report = buildQaToolCoverageReport({ + scenarios: [ + makeScenario("tool-web-search", "web-search", { + toolCoverage: { + bucket: "openclaw-dynamic-integration", + expectedLayer: "openclaw-dynamic", + capabilityLayer: "openclaw-dynamic-direct", + required: true, + }, + }), + ], + summary: { + scenarios: [], + }, + generatedAt: "2026-05-10T00:00:00.000Z", + }); + + expect(report.pass).toBe(false); + expect(report.failures).toEqual(["web-search drift=not-run"]); + }); + it("rejects unknown runtime tool 
coverage buckets", () => { expect(() => buildQaToolCoverageReport({ @@ -301,5 +378,13 @@ describe("qa tool coverage report", () => { "#80173 Tavily tools are listed in the phase matrix but are not exposed by the current default tool surface.", }), ); + expect(report.rows.find((row) => row.tool === "web-search")).toEqual( + expect.objectContaining({ + bucket: "openclaw-dynamic-integration", + capabilityLayer: "openclaw-dynamic-direct", + required: true, + }), + ); + expect(report.rows.find((row) => row.tool === "web-search")?.tracking).toBeUndefined(); }); }); diff --git a/extensions/qa-lab/src/tool-coverage-report.ts b/extensions/qa-lab/src/tool-coverage-report.ts index ee1a94195e8..12c17762bb2 100644 --- a/extensions/qa-lab/src/tool-coverage-report.ts +++ b/extensions/qa-lab/src/tool-coverage-report.ts @@ -71,7 +71,7 @@ type ToolFixtureGroup = { scenarios: QaSeedScenarioWithSource[]; }; -const PASSING_DRIFTS: ReadonlySet = new Set(["none", "text-only", "not-run"]); +const PASSING_DRIFTS: ReadonlySet = new Set(["none", "text-only"]); function isRecord(value: unknown): value is Record { return Boolean(value) && typeof value === "object" && !Array.isArray(value); @@ -173,6 +173,10 @@ function mergeScenarioResults( return failingResult; } +function isPassingToolCoverageDrift(drift: QaToolCoverageDrift, evaluated: boolean) { + return PASSING_DRIFTS.has(drift) || (!evaluated && drift === "not-run"); +} + function buildRow(params: { group: ToolFixtureGroup; results: ReadonlyMap; @@ -222,7 +226,9 @@ export function buildQaToolCoverageReport(params: { const evaluated = Boolean(params.summary); const failures = evaluated ? rows - .filter((row) => row.required && !row.tracking && !PASSING_DRIFTS.has(row.drift)) + .filter( + (row) => row.required && !row.tracking && !isPassingToolCoverageDrift(row.drift, true), + ) .map((row) => `${row.tool} drift=${row.drift}${row.details ? 
` (${row.details})` : ""}`) : []; return { @@ -237,7 +243,9 @@ export function buildQaToolCoverageReport(params: { dynamicIntegrationTools: rows.filter((row) => row.bucket === "openclaw-dynamic-integration") .length, optionalTools: rows.filter((row) => row.bucket === "optional-profile-or-plugin").length, - passingTools: evaluated ? rows.filter((row) => PASSING_DRIFTS.has(row.drift)).length : 0, + passingTools: evaluated + ? rows.filter((row) => isPassingToolCoverageDrift(row.drift, true)).length + : 0, failingTools: failures.length, rows, pass: failures.length === 0, diff --git a/qa/scenarios/index.md b/qa/scenarios/index.md index 7c6428b1491..26b5f6a81dd 100644 --- a/qa/scenarios/index.md +++ b/qa/scenarios/index.md @@ -28,7 +28,10 @@ Coverage tracking: Runtime parity tiers: - `standard`: required Codex-vs-Pi mock gate coverage for first-hour depth and - default runtime-tool fixtures; selected with + default runtime-tool fixtures. OpenClaw dynamic integration tools in this + tier are hard-gated by `openclaw qa coverage --tools --summary`; Codex-native + workspace rows remain separately tracked until native/live behavior is the + asserted surface. Select this tier with `openclaw qa suite --runtime-pair pi,codex --runtime-parity-tier standard` - `optional`: profile-, plugin-, or external-service-dependent runtime-tool fixtures that stay out of the default release gate diff --git a/qa/scenarios/runtime/tools/image-generate.md b/qa/scenarios/runtime/tools/image-generate.md index 67184ac2352..cce41c035a0 100644 --- a/qa/scenarios/runtime/tools/image-generate.md +++ b/qa/scenarios/runtime/tools/image-generate.md @@ -13,6 +13,7 @@ successCriteria: - Effective tools expose image_generate after QA image-generation config is applied. - The mock provider plans exactly one happy-path image_generate call. - The mock provider plans one denied-input failure-path image_generate call. + - Runtime parity coverage hard-fails call/result drift in the standard direct-loading gate. 
docsRefs: - docs/tools/image-generation.md codeRefs: @@ -29,15 +30,12 @@ execution: actualTool: image_generate bucket: openclaw-dynamic-integration expectedLayer: openclaw-dynamic + capabilityLayer: openclaw-dynamic-direct required: true - tracking: "#80319" codexDefaultImpact: P4 qaImpact: P1 - action: teach fixture/mock planner Codex searchable OpenClaw dynamic tool behavior - reason: image_generate is an OpenClaw integration tool; QA mock provider does not yet model Codex searchable/deferred dynamic tool declarations for this fixture. - knownHarnessGap: - issue: "#80319" - reason: QA mock provider does not yet model Codex searchable/deferred OpenClaw dynamic tool declarations for this fixture. + action: hard gate in the standard direct-loading tier + reason: image_generate is an OpenClaw integration tool and must stay visible and callable under Pi and Codex direct runtime parity. promptSnippet: "target=image_generate" failurePromptSnippet: "failure target=image_generate" ``` diff --git a/qa/scenarios/runtime/tools/session-status.md b/qa/scenarios/runtime/tools/session-status.md index cd9a8f16375..1bc27644244 100644 --- a/qa/scenarios/runtime/tools/session-status.md +++ b/qa/scenarios/runtime/tools/session-status.md @@ -13,6 +13,7 @@ successCriteria: - Effective tools expose session_status. - The mock provider plans exactly one happy-path session_status call. - The mock provider plans one denied-input failure-path session_status call. + - Runtime parity coverage hard-fails call/result drift in the standard direct-loading gate. 
docsRefs: - qa/scenarios/index.md codeRefs: @@ -28,15 +29,12 @@ execution: actualTool: session_status bucket: openclaw-dynamic-integration expectedLayer: openclaw-dynamic + capabilityLayer: openclaw-dynamic-direct required: true - tracking: "#80319" codexDefaultImpact: P4 qaImpact: P1 - action: teach fixture/mock planner Codex searchable OpenClaw dynamic tool behavior - reason: session_status is an OpenClaw integration tool; QA mock provider does not yet model Codex searchable/deferred dynamic tool declarations for this fixture. - knownHarnessGap: - issue: "#80319" - reason: QA mock provider does not yet model Codex searchable/deferred OpenClaw dynamic tool declarations for this fixture. + action: hard gate in the standard direct-loading tier + reason: session_status is an OpenClaw integration tool and must stay visible and callable under Pi and Codex direct runtime parity. promptSnippet: "target=session_status" failurePromptSnippet: "failure target=session_status" ``` diff --git a/qa/scenarios/runtime/tools/sessions-spawn.md b/qa/scenarios/runtime/tools/sessions-spawn.md index 97af86dc6b0..ae05fbd0b07 100644 --- a/qa/scenarios/runtime/tools/sessions-spawn.md +++ b/qa/scenarios/runtime/tools/sessions-spawn.md @@ -13,6 +13,7 @@ successCriteria: - Effective tools expose sessions_spawn. - The mock provider plans exactly one happy-path sessions_spawn call. - The mock provider plans one denied-input failure-path sessions_spawn call. + - Runtime parity coverage hard-fails call/result drift in the standard direct-loading gate. 
docsRefs: - qa/scenarios/index.md codeRefs: @@ -28,15 +29,12 @@ execution: actualTool: sessions_spawn bucket: openclaw-dynamic-integration expectedLayer: openclaw-dynamic + capabilityLayer: openclaw-dynamic-direct required: true - tracking: "#80319" codexDefaultImpact: P4 qaImpact: P1 - action: teach fixture/mock planner Codex searchable OpenClaw dynamic tool behavior - reason: sessions_spawn is an OpenClaw integration tool; QA mock provider does not yet model Codex searchable/deferred dynamic tool declarations for this fixture. - knownHarnessGap: - issue: "#80319" - reason: QA mock provider does not yet model Codex searchable/deferred OpenClaw dynamic tool declarations for this fixture. + action: hard gate in the standard direct-loading tier + reason: sessions_spawn is an OpenClaw integration tool and must stay visible and callable under Pi and Codex direct runtime parity. promptSnippet: "target=sessions_spawn" failurePromptSnippet: "failure target=sessions_spawn" ``` diff --git a/qa/scenarios/runtime/tools/web-fetch.md b/qa/scenarios/runtime/tools/web-fetch.md index 32612ced4fa..5ed2e353984 100644 --- a/qa/scenarios/runtime/tools/web-fetch.md +++ b/qa/scenarios/runtime/tools/web-fetch.md @@ -13,6 +13,7 @@ successCriteria: - Effective tools expose web_fetch. - The mock provider plans exactly one happy-path web_fetch call. - The mock provider plans one denied-input failure-path web_fetch call. + - Runtime parity coverage hard-fails call/result drift in the standard direct-loading gate. 
docsRefs: - qa/scenarios/index.md codeRefs: @@ -28,15 +29,12 @@ execution: actualTool: web_fetch bucket: openclaw-dynamic-integration expectedLayer: openclaw-dynamic + capabilityLayer: openclaw-dynamic-direct required: true - tracking: "#80319" codexDefaultImpact: P4 qaImpact: P1 - action: teach fixture/mock planner Codex searchable OpenClaw dynamic tool behavior - reason: web_fetch is an OpenClaw integration tool; QA mock provider does not yet model Codex searchable/deferred dynamic tool declarations for this fixture. - knownHarnessGap: - issue: "#80319" - reason: QA mock provider does not yet model Codex searchable/deferred OpenClaw dynamic tool declarations for this fixture. + action: hard gate in the standard direct-loading tier + reason: web_fetch is an OpenClaw integration tool and must stay visible and callable under Pi and Codex direct runtime parity. promptSnippet: "target=web_fetch" failurePromptSnippet: "failure target=web_fetch" ``` diff --git a/qa/scenarios/runtime/tools/web-search.md b/qa/scenarios/runtime/tools/web-search.md index 4defbe790ee..5712ada13c4 100644 --- a/qa/scenarios/runtime/tools/web-search.md +++ b/qa/scenarios/runtime/tools/web-search.md @@ -13,6 +13,7 @@ successCriteria: - Effective tools expose web_search. - The mock provider plans exactly one happy-path web_search call. - The mock provider plans one denied-input failure-path web_search call. + - Runtime parity coverage hard-fails call/result drift in the standard direct-loading gate. 
docsRefs: - qa/scenarios/index.md codeRefs: @@ -28,15 +29,12 @@ execution: actualTool: web_search bucket: openclaw-dynamic-integration expectedLayer: openclaw-dynamic + capabilityLayer: openclaw-dynamic-direct required: true - tracking: "#80319" codexDefaultImpact: P4 qaImpact: P1 - action: teach fixture/mock planner Codex searchable OpenClaw dynamic tool behavior - reason: web_search is an OpenClaw integration tool; QA mock provider does not yet model Codex searchable/deferred dynamic tool declarations for this fixture. - knownHarnessGap: - issue: "#80319" - reason: QA mock provider does not yet model Codex searchable/deferred OpenClaw dynamic tool declarations for this fixture. + action: hard gate in the standard direct-loading tier + reason: web_search is an OpenClaw integration tool and must stay visible and callable under Pi and Codex direct runtime parity. promptSnippet: "target=web_search" failurePromptSnippet: "failure target=web_search" ``` diff --git a/scripts/qa-coverage-report.ts b/scripts/qa-coverage-report.ts index dfe9bcba67e..6858c6edc6b 100644 --- a/scripts/qa-coverage-report.ts +++ b/scripts/qa-coverage-report.ts @@ -4,6 +4,8 @@ type Options = { json?: boolean; output?: string; repoRoot?: string; + summary?: string; + tools?: boolean; }; function takeValue(args: string[], index: number, flag: string): string { @@ -27,6 +29,8 @@ Options: --json Print machine-readable JSON --output Write the report to a file --repo-root Repository root to target + --summary Runtime qa-suite-summary.json to overlay on --tools coverage + --tools Print runtime tool fixture coverage instead of scenario coverage -h, --help Display help `); process.exit(0); @@ -41,6 +45,13 @@ Options: opts.repoRoot = takeValue(args, index, arg); index += 1; break; + case "--summary": + opts.summary = takeValue(args, index, arg); + index += 1; + break; + case "--tools": + opts.tools = true; + break; default: throw new Error(`Unknown qa coverage option: ${arg}`); } @@ -53,4 +64,6 @@ await 
runQaCoverageReportCommand({ ...(opts.json ? { json: true } : {}), ...(opts.output ? { output: opts.output } : {}), ...(opts.repoRoot ? { repoRoot: opts.repoRoot } : {}), + ...(opts.summary ? { summary: opts.summary } : {}), + ...(opts.tools ? { tools: true } : {}), }); diff --git a/src/infra/run-node.test.ts b/src/infra/run-node.test.ts index 8c8e553b052..1245f60a871 100644 --- a/src/infra/run-node.test.ts +++ b/src/infra/run-node.test.ts @@ -1093,7 +1093,14 @@ describe("run-node script", () => { const exitCode = await runNodeMain({ cwd: tmp, - args: ["qa", "coverage", "--json"], + args: [ + "qa", + "coverage", + "--json", + "--tools", + "--summary", + ".artifacts/qa-e2e/runtime-parity-standard/qa-suite-summary.json", + ], env: { ...process.env, OPENCLAW_RUNNER_LOG: "0", @@ -1111,6 +1118,9 @@ describe("run-node script", () => { "tsx", path.join(tmp, "scripts", "qa-coverage-report.ts"), "--json", + "--tools", + "--summary", + ".artifacts/qa-e2e/runtime-parity-standard/qa-suite-summary.json", ], ]); }); diff --git a/test/scripts/plugin-prerelease-test-plan.test.ts b/test/scripts/plugin-prerelease-test-plan.test.ts index 8df271b4632..810cf5a4da6 100644 --- a/test/scripts/plugin-prerelease-test-plan.test.ts +++ b/test/scripts/plugin-prerelease-test-plan.test.ts @@ -490,6 +490,45 @@ describe("scripts/lib/plugin-prerelease-test-plan.mjs", () => { } }); + it("keeps runtime tool coverage blocking in release checks", () => { + const releaseChecksSource = readFileSync( + ".github/workflows/openclaw-release-checks.yml", + "utf8", + ); + const releaseChecksWorkflow = parse(releaseChecksSource); + const runtimeToolCoverage = releaseChecksWorkflow.jobs.runtime_tool_coverage_release_checks; + + expect(runtimeToolCoverage["continue-on-error"]).toBeUndefined(); + expect(runtimeToolCoverage.needs).toEqual([ + "resolve_target", + "qa_lab_runtime_parity_release_checks", + ]); + expect(runtimeToolCoverage.steps).toEqual( + expect.arrayContaining([ + expect.objectContaining({ + name: 
"Enforce standard runtime tool coverage", + run: expect.stringContaining("pnpm openclaw qa coverage"), + }), + ]), + ); + expect(runtimeToolCoverage.steps).toEqual( + expect.arrayContaining([ + expect.objectContaining({ + name: "Enforce standard runtime tool coverage", + run: expect.stringContaining( + "--summary .artifacts/qa-e2e/runtime-parity-standard/qa-suite-summary.json", + ), + }), + ]), + ); + expect(releaseChecksWorkflow.jobs.summary.needs).toContain( + "runtime_tool_coverage_release_checks", + ); + expect(releaseChecksSource).toContain( + '"runtime_tool_coverage_release_checks=${{ needs.runtime_tool_coverage_release_checks.result }}"', + ); + }); + it("keeps the live-ish availability check redacted", () => { const output = execFileSync( process.execPath,