mirror of
https://github.com/openclaw/openclaw.git
synced 2026-05-24 23:43:03 +00:00
fix(qa-lab): hard gate runtime tool coverage
This commit is contained in:
53
.github/workflows/openclaw-release-checks.yml
vendored
53
.github/workflows/openclaw-release-checks.yml
vendored
@@ -955,6 +955,57 @@ jobs:
|
||||
retention-days: 14
|
||||
if-no-files-found: warn
|
||||
|
||||
runtime_tool_coverage_release_checks:
|
||||
name: Enforce QA Lab runtime tool coverage
|
||||
needs: [resolve_target, qa_lab_runtime_parity_release_checks]
|
||||
if: always() && contains(fromJSON('["all","qa","qa-parity"]'), needs.resolve_target.outputs.rerun_group)
|
||||
runs-on: ubuntu-24.04
|
||||
timeout-minutes: 15
|
||||
permissions:
|
||||
contents: read
|
||||
actions: read
|
||||
env:
|
||||
OPENCLAW_BUILD_PRIVATE_QA: "1"
|
||||
OPENCLAW_ENABLE_PRIVATE_QA_CLI: "1"
|
||||
steps:
|
||||
- name: Checkout selected ref
|
||||
uses: actions/checkout@v6
|
||||
with:
|
||||
persist-credentials: false
|
||||
ref: ${{ needs.resolve_target.outputs.revision }}
|
||||
fetch-depth: 1
|
||||
|
||||
- name: Setup Node environment
|
||||
uses: ./.github/actions/setup-node-env
|
||||
with:
|
||||
node-version: ${{ env.NODE_VERSION }}
|
||||
pnpm-version: ${{ env.PNPM_VERSION }}
|
||||
install-bun: "true"
|
||||
|
||||
- name: Download runtime parity artifacts
|
||||
uses: actions/download-artifact@v4
|
||||
with:
|
||||
name: release-qa-runtime-parity-${{ needs.resolve_target.outputs.revision }}
|
||||
path: .artifacts/qa-e2e/
|
||||
|
||||
- name: Enforce standard runtime tool coverage
|
||||
run: |
|
||||
set -euo pipefail
|
||||
pnpm openclaw qa coverage \
|
||||
--repo-root . \
|
||||
--tools \
|
||||
--summary .artifacts/qa-e2e/runtime-parity-standard/qa-suite-summary.json \
|
||||
--output .artifacts/qa-e2e/runtime-parity-standard-report/qa-runtime-tool-coverage-report.md
|
||||
|
||||
- name: Upload runtime tool coverage artifacts
|
||||
if: always()
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
name: release-qa-runtime-tool-coverage-${{ needs.resolve_target.outputs.revision }}
|
||||
path: .artifacts/qa-e2e/runtime-parity-standard-report/
|
||||
retention-days: 14
|
||||
if-no-files-found: warn
|
||||
|
||||
qa_live_matrix_release_checks:
|
||||
name: Run QA Lab live Matrix lane
|
||||
needs: [resolve_target]
|
||||
@@ -1434,6 +1485,7 @@ jobs:
|
||||
- qa_lab_parity_lane_release_checks
|
||||
- qa_lab_parity_report_release_checks
|
||||
- qa_lab_runtime_parity_release_checks
|
||||
- runtime_tool_coverage_release_checks
|
||||
- qa_live_matrix_release_checks
|
||||
- qa_live_telegram_release_checks
|
||||
- qa_live_discord_release_checks
|
||||
@@ -1465,6 +1517,7 @@ jobs:
|
||||
"qa_lab_parity_lane_release_checks=${{ needs.qa_lab_parity_lane_release_checks.result }}" \
|
||||
"qa_lab_parity_report_release_checks=${{ needs.qa_lab_parity_report_release_checks.result }}" \
|
||||
"qa_lab_runtime_parity_release_checks=${{ needs.qa_lab_runtime_parity_release_checks.result }}" \
|
||||
"runtime_tool_coverage_release_checks=${{ needs.runtime_tool_coverage_release_checks.result }}" \
|
||||
"qa_live_matrix_release_checks=${{ needs.qa_live_matrix_release_checks.result }}" \
|
||||
"qa_live_telegram_release_checks=${{ needs.qa_live_telegram_release_checks.result }}" \
|
||||
"qa_live_discord_release_checks=${{ needs.qa_live_discord_release_checks.result }}" \
|
||||
|
||||
@@ -24,6 +24,7 @@ Docs: https://docs.openclaw.ai
|
||||
- QA-Lab: add live-only harness self-health scenarios for plugin hook crashes, manifest contract errors, and WebChat direct-reply self-message routing. (#80323) Thanks @100yenadmin.
|
||||
- QA-Lab: add runtime tool fixture scenarios and coverage reporting for Codex-native workspace tools, OpenClaw dynamic tools, and optional plugin-backed tools. Fixes #80173. Thanks @100yenadmin.
|
||||
- QA-Lab: expose runtime tool fixture coverage through `openclaw qa coverage --tools`, with optional suite-summary evaluation for parity gate artifacts. Thanks @100yenadmin.
|
||||
- QA-Lab: hard-gate required OpenClaw dynamic runtime-tool drift in the standard Codex-vs-Pi tier with a blocking release-check verifier and publish the tool coverage report artifact. Fixes #80339; refs #80319. Thanks @100yenadmin.
|
||||
- QA-Lab: add the personal-agent approval-denial scenario so the benchmark pack verifies denied local reads stop cleanly without tool progress or fixture leaks. (#83150) Thanks @iFiras-Max1.
|
||||
|
||||
### Fixes
|
||||
|
||||
@@ -203,7 +203,7 @@ Docker release-path soak; `full` forces soak on.
|
||||
|
||||
The umbrella records the dispatched child run ids, and the final `Verify full validation` job re-checks current child run conclusions and appends slowest-job tables for each child run. If a child workflow is rerun and turns green, rerun only the parent verifier job to refresh the umbrella result and timing summary.
|
||||
|
||||
For recovery, both `Full Release Validation` and `OpenClaw Release Checks` accept `rerun_group`. Use `all` for a release candidate, `ci` for only the normal full CI child, `plugin-prerelease` for only the plugin prerelease child, `release-checks` for every release child, or a narrower group: `install-smoke`, `cross-os`, `live-e2e`, `package`, `qa`, `qa-parity`, `qa-live`, or `npm-telegram` on the umbrella. This keeps a failed release box rerun bounded after a focused fix. For one failed cross-OS lane, combine `rerun_group=cross-os` with `cross_os_suite_filter`, for example `windows/packaged-upgrade`; long cross-OS commands emit heartbeat lines and packaged-upgrade summaries include per-phase timings. QA release-check lanes are advisory, so QA-only failures warn but do not block the release-check verifier.
|
||||
For recovery, both `Full Release Validation` and `OpenClaw Release Checks` accept `rerun_group`. Use `all` for a release candidate, `ci` for only the normal full CI child, `plugin-prerelease` for only the plugin prerelease child, `release-checks` for every release child, or a narrower group: `install-smoke`, `cross-os`, `live-e2e`, `package`, `qa`, `qa-parity`, `qa-live`, or `npm-telegram` on the umbrella. This keeps a failed release box rerun bounded after a focused fix. For one failed cross-OS lane, combine `rerun_group=cross-os` with `cross_os_suite_filter`, for example `windows/packaged-upgrade`; long cross-OS commands emit heartbeat lines and packaged-upgrade summaries include per-phase timings. QA release-check lanes are advisory except the standard runtime tool coverage gate, which blocks the release-check verifier when required OpenClaw dynamic tools drift or disappear from the standard tier summary.
|
||||
|
||||
`OpenClaw Release Checks` uses the trusted workflow ref to resolve the selected ref once into a `release-package-under-test` tarball, then passes that artifact to cross-OS checks and Package Acceptance, plus the live/E2E release-path Docker workflow when soak coverage runs. That keeps the package bytes consistent across release boxes and avoids repacking the same candidate in multiple child jobs.
|
||||
|
||||
|
||||
@@ -442,8 +442,10 @@ Focused `npm-telegram` reruns require `release_package_spec` or
|
||||
`npm_telegram_package_spec`; full/all runs with `release_profile=full` use the
|
||||
release-checks package artifact. Focused
|
||||
cross-OS reruns can add `cross_os_suite_filter=windows/packaged-upgrade` or
|
||||
another OS/suite filter. QA release-check failures are advisory; a QA-only
|
||||
failure does not block release validation.
|
||||
another OS/suite filter. QA release-check failures are advisory except the
|
||||
standard runtime tool coverage gate, which blocks release validation when
|
||||
required OpenClaw dynamic tools drift or disappear from the standard tier
|
||||
summary.
|
||||
|
||||
### Vitest
|
||||
|
||||
|
||||
@@ -166,9 +166,10 @@ summaries include per-phase timings for packaged upgrade lanes, and long-running
|
||||
commands print heartbeat lines so a stuck Windows update is visible before the
|
||||
job timeout.
|
||||
|
||||
QA release-check lanes are advisory. A QA-only failure is reported as a warning
|
||||
and does not block the release-check verifier; rerun `rerun_group=qa`,
|
||||
`qa-parity`, or `qa-live` when you need fresh QA evidence.
|
||||
QA release-check lanes are advisory except the standard runtime tool coverage
|
||||
gate. Required OpenClaw dynamic tool drift in the standard tier blocks the
|
||||
release-check verifier; other QA-only failures are reported as warnings. Rerun
|
||||
`rerun_group=qa`, `qa-parity`, or `qa-live` when you need fresh QA evidence.
|
||||
|
||||
## Evidence to keep
|
||||
|
||||
|
||||
@@ -979,6 +979,64 @@ describe("qa cli runtime", () => {
|
||||
expectWriteContains(stdoutWrite, "codex-native-workspace");
|
||||
});
|
||||
|
||||
// Writes a summary with one failing web_search parity scenario (Pi called the
// tool, Codex did not) and checks that the tools coverage command both prints
// a failing verdict and sets a nonzero process exit code.
it("exits nonzero when tool coverage summary has required drift", async () => {
  const priorExitCode = process.exitCode;
  const repoRoot = await fs.mkdtemp(path.join(os.tmpdir(), "qa-tool-coverage-"));
  try {
    // Clear any exit code left behind by earlier code in this process so the
    // assertion below can only pass when this invocation sets it.
    process.exitCode = undefined;

    await fs.writeFile(
      path.join(repoRoot, "runtime-summary.json"),
      JSON.stringify({
        scenarios: [
          {
            name: "runtime-tool-web-search",
            status: "fail",
            runtimeParity: {
              scenarioId: "runtime-tool-web-search",
              drift: "tool-call-shape",
              driftDetails: "Codex emitted no web_search call",
              cells: {
                pi: {
                  runtime: "pi",
                  transcriptBytes: "",
                  toolCalls: [{ tool: "web_search", argsHash: "a", resultHash: "r" }],
                  finalText: "",
                  usage: { inputTokens: 0, outputTokens: 0, totalTokens: 0 },
                  wallClockMs: 1,
                  bootStateLines: [],
                },
                codex: {
                  runtime: "codex",
                  transcriptBytes: "",
                  toolCalls: [],
                  finalText: "",
                  usage: { inputTokens: 0, outputTokens: 0, totalTokens: 0 },
                  wallClockMs: 1,
                  bootStateLines: [],
                },
              },
            },
          },
        ],
        run: { runtimePair: ["pi", "codex"] },
      }),
      "utf8",
    );

    // The summary path is passed relative to repoRoot, where it was written above.
    await runQaCoverageReportCommand({
      repoRoot,
      tools: true,
      summary: "runtime-summary.json",
    });

    expect(process.exitCode).toBe(1);
    expectWriteContains(stdoutWrite, "- Verdict: fail");
    expectWriteContains(stdoutWrite, "web-search drift=tool-call-shape");
  } finally {
    // Restore the exit code seen on entry and remove the temp repo root.
    process.exitCode = priorExitCode;
    await fs.rm(repoRoot, { recursive: true, force: true });
  }
});
|
||||
|
||||
it("resolves character eval paths and passes model refs through", async () => {
|
||||
await runQaCharacterEvalCommand({
|
||||
repoRoot: "/tmp/openclaw-repo",
|
||||
|
||||
@@ -769,6 +769,9 @@ export async function runQaCoverageReportCommand(opts: {
|
||||
? `${JSON.stringify(report, null, 2)}\n`
|
||||
: renderQaToolCoverageMarkdownReport(report);
|
||||
outputLabel = "QA tool coverage report";
|
||||
if (summary && !report.pass) {
|
||||
process.exitCode = 1;
|
||||
}
|
||||
} else {
|
||||
if (opts.summary?.trim()) {
|
||||
throw new Error("--summary requires --tools.");
|
||||
|
||||
@@ -120,6 +120,7 @@ describe("qa scenario catalog", () => {
|
||||
const applyPatch = readQaScenarioById("runtime-tool-apply-patch");
|
||||
const messageTool = readQaScenarioById("runtime-tool-message-tool");
|
||||
const tavilySearch = readQaScenarioById("runtime-tool-tavily-search");
|
||||
const webSearch = readQaScenarioById("runtime-tool-web-search");
|
||||
|
||||
expect(applyPatch.runtimeParityTier).toBe("standard");
|
||||
expect(messageTool.runtimeParityTier).toBe("optional");
|
||||
@@ -140,6 +141,16 @@ describe("qa scenario catalog", () => {
|
||||
required: false,
|
||||
},
|
||||
});
|
||||
expect(readQaScenarioExecutionConfig(webSearch.id)).toMatchObject({
|
||||
toolName: "web_search",
|
||||
toolCoverage: {
|
||||
bucket: "openclaw-dynamic-integration",
|
||||
expectedLayer: "openclaw-dynamic",
|
||||
capabilityLayer: "openclaw-dynamic-direct",
|
||||
required: true,
|
||||
},
|
||||
});
|
||||
expect(readQaScenarioExecutionConfig(webSearch.id)).not.toHaveProperty("knownHarnessGap");
|
||||
});
|
||||
|
||||
it("loads the Codex Pi-shaped Read vocabulary live parity canary", () => {
|
||||
|
||||
@@ -223,6 +223,83 @@ describe("qa tool coverage report", () => {
|
||||
);
|
||||
});
|
||||
|
||||
// A required dynamic-integration scenario without a `tracking` entry whose
// summary result reports tool-call-shape drift must be listed as a failure.
it("fails untracked required OpenClaw dynamic tool drift", () => {
  const { pass, failures } = buildQaToolCoverageReport({
    scenarios: [
      makeScenario("tool-web-search", "web-search", {
        toolName: "web_search",
        toolCoverage: {
          bucket: "openclaw-dynamic-integration",
          expectedLayer: "openclaw-dynamic",
          capabilityLayer: "openclaw-dynamic-direct",
          required: true,
        },
      }),
    ],
    summary: {
      scenarios: [
        {
          name: "tool web_search",
          status: "fail",
          runtimeParity: {
            scenarioId: "tool-web-search",
            drift: "tool-call-shape",
            driftDetails: "Codex emitted no web_search call",
            cells: {
              // Pi records one web_search call ...
              pi: {
                runtime: "pi",
                transcriptBytes: "",
                toolCalls: [{ tool: "web_search", argsHash: "a", resultHash: "r" }],
                finalText: "",
                usage: { inputTokens: 0, outputTokens: 0, totalTokens: 0 },
                wallClockMs: 1,
                bootStateLines: [],
              },
              // ... while Codex records none.
              codex: {
                runtime: "codex",
                transcriptBytes: "",
                toolCalls: [],
                finalText: "",
                usage: { inputTokens: 0, outputTokens: 0, totalTokens: 0 },
                wallClockMs: 1,
                bootStateLines: [],
              },
            },
          },
        },
      ],
    },
    generatedAt: "2026-05-10T00:00:00.000Z",
  });

  expect(pass).toBe(false);
  expect(failures).toEqual(["web-search drift=tool-call-shape (Codex emitted no web_search call)"]);
});
|
||||
|
||||
// With a summary supplied but containing no scenarios, a required untracked
// tool is expected to be reported as `not-run` and to fail the report.
it("fails untracked required tools missing from an evaluated summary", () => {
  const { pass, failures } = buildQaToolCoverageReport({
    scenarios: [
      makeScenario("tool-web-search", "web-search", {
        toolCoverage: {
          bucket: "openclaw-dynamic-integration",
          expectedLayer: "openclaw-dynamic",
          capabilityLayer: "openclaw-dynamic-direct",
          required: true,
        },
      }),
    ],
    summary: {
      scenarios: [],
    },
    generatedAt: "2026-05-10T00:00:00.000Z",
  });

  expect(pass).toBe(false);
  expect(failures).toEqual(["web-search drift=not-run"]);
});
|
||||
|
||||
it("rejects unknown runtime tool coverage buckets", () => {
|
||||
expect(() =>
|
||||
buildQaToolCoverageReport({
|
||||
@@ -301,5 +378,13 @@ describe("qa tool coverage report", () => {
|
||||
"#80173 Tavily tools are listed in the phase matrix but are not exposed by the current default tool surface.",
|
||||
}),
|
||||
);
|
||||
expect(report.rows.find((row) => row.tool === "web-search")).toEqual(
|
||||
expect.objectContaining({
|
||||
bucket: "openclaw-dynamic-integration",
|
||||
capabilityLayer: "openclaw-dynamic-direct",
|
||||
required: true,
|
||||
}),
|
||||
);
|
||||
expect(report.rows.find((row) => row.tool === "web-search")?.tracking).toBeUndefined();
|
||||
});
|
||||
});
|
||||
|
||||
@@ -71,7 +71,7 @@ type ToolFixtureGroup = {
|
||||
scenarios: QaSeedScenarioWithSource[];
|
||||
};
|
||||
|
||||
const PASSING_DRIFTS: ReadonlySet<QaToolCoverageDrift> = new Set(["none", "text-only", "not-run"]);
|
||||
const PASSING_DRIFTS: ReadonlySet<QaToolCoverageDrift> = new Set(["none", "text-only"]);
|
||||
|
||||
function isRecord(value: unknown): value is Record<string, unknown> {
|
||||
return Boolean(value) && typeof value === "object" && !Array.isArray(value);
|
||||
@@ -173,6 +173,10 @@ function mergeScenarioResults(
|
||||
return failingResult;
|
||||
}
|
||||
|
||||
/**
 * Decides whether a drift value counts as passing.
 *
 * Any drift in PASSING_DRIFTS passes. "not-run" additionally passes only when
 * the report is not evaluated.
 */
function isPassingToolCoverageDrift(drift: QaToolCoverageDrift, evaluated: boolean) {
  if (PASSING_DRIFTS.has(drift)) {
    return true;
  }
  return drift === "not-run" && !evaluated;
}
|
||||
|
||||
function buildRow(params: {
|
||||
group: ToolFixtureGroup;
|
||||
results: ReadonlyMap<string, RuntimeParityResult>;
|
||||
@@ -222,7 +226,9 @@ export function buildQaToolCoverageReport(params: {
|
||||
const evaluated = Boolean(params.summary);
|
||||
const failures = evaluated
|
||||
? rows
|
||||
.filter((row) => row.required && !row.tracking && !PASSING_DRIFTS.has(row.drift))
|
||||
.filter(
|
||||
(row) => row.required && !row.tracking && !isPassingToolCoverageDrift(row.drift, true),
|
||||
)
|
||||
.map((row) => `${row.tool} drift=${row.drift}${row.details ? ` (${row.details})` : ""}`)
|
||||
: [];
|
||||
return {
|
||||
@@ -237,7 +243,9 @@ export function buildQaToolCoverageReport(params: {
|
||||
dynamicIntegrationTools: rows.filter((row) => row.bucket === "openclaw-dynamic-integration")
|
||||
.length,
|
||||
optionalTools: rows.filter((row) => row.bucket === "optional-profile-or-plugin").length,
|
||||
passingTools: evaluated ? rows.filter((row) => PASSING_DRIFTS.has(row.drift)).length : 0,
|
||||
passingTools: evaluated
|
||||
? rows.filter((row) => isPassingToolCoverageDrift(row.drift, true)).length
|
||||
: 0,
|
||||
failingTools: failures.length,
|
||||
rows,
|
||||
pass: failures.length === 0,
|
||||
|
||||
@@ -28,7 +28,10 @@ Coverage tracking:
|
||||
Runtime parity tiers:
|
||||
|
||||
- `standard`: required Codex-vs-Pi mock gate coverage for first-hour depth and
|
||||
default runtime-tool fixtures; selected with
|
||||
default runtime-tool fixtures. OpenClaw dynamic integration tools in this
|
||||
tier are hard-gated by `openclaw qa coverage --tools --summary`; Codex-native
|
||||
workspace rows remain separately tracked until native/live behavior is the
|
||||
asserted surface. Selected with
|
||||
`openclaw qa suite --runtime-pair pi,codex --runtime-parity-tier standard`
|
||||
- `optional`: profile-, plugin-, or external-service-dependent runtime-tool
|
||||
fixtures that stay out of the default release gate
|
||||
|
||||
@@ -13,6 +13,7 @@ successCriteria:
|
||||
- Effective tools expose image_generate after QA image-generation config is applied.
|
||||
- The mock provider plans exactly one happy-path image_generate call.
|
||||
- The mock provider plans one denied-input failure-path image_generate call.
|
||||
- Runtime parity coverage hard-fails call/result drift in the standard direct-loading gate.
|
||||
docsRefs:
|
||||
- docs/tools/image-generation.md
|
||||
codeRefs:
|
||||
@@ -29,15 +30,12 @@ execution:
|
||||
actualTool: image_generate
|
||||
bucket: openclaw-dynamic-integration
|
||||
expectedLayer: openclaw-dynamic
|
||||
capabilityLayer: openclaw-dynamic-direct
|
||||
required: true
|
||||
tracking: "#80319"
|
||||
codexDefaultImpact: P4
|
||||
qaImpact: P1
|
||||
action: teach fixture/mock planner Codex searchable OpenClaw dynamic tool behavior
|
||||
reason: image_generate is an OpenClaw integration tool; QA mock provider does not yet model Codex searchable/deferred dynamic tool declarations for this fixture.
|
||||
knownHarnessGap:
|
||||
issue: "#80319"
|
||||
reason: QA mock provider does not yet model Codex searchable/deferred OpenClaw dynamic tool declarations for this fixture.
|
||||
action: hard gate in the standard direct-loading tier
|
||||
reason: image_generate is an OpenClaw integration tool and must stay visible and callable under Pi and Codex direct runtime parity.
|
||||
promptSnippet: "target=image_generate"
|
||||
failurePromptSnippet: "failure target=image_generate"
|
||||
```
|
||||
|
||||
@@ -13,6 +13,7 @@ successCriteria:
|
||||
- Effective tools expose session_status.
|
||||
- The mock provider plans exactly one happy-path session_status call.
|
||||
- The mock provider plans one denied-input failure-path session_status call.
|
||||
- Runtime parity coverage hard-fails call/result drift in the standard direct-loading gate.
|
||||
docsRefs:
|
||||
- qa/scenarios/index.md
|
||||
codeRefs:
|
||||
@@ -28,15 +29,12 @@ execution:
|
||||
actualTool: session_status
|
||||
bucket: openclaw-dynamic-integration
|
||||
expectedLayer: openclaw-dynamic
|
||||
capabilityLayer: openclaw-dynamic-direct
|
||||
required: true
|
||||
tracking: "#80319"
|
||||
codexDefaultImpact: P4
|
||||
qaImpact: P1
|
||||
action: teach fixture/mock planner Codex searchable OpenClaw dynamic tool behavior
|
||||
reason: session_status is an OpenClaw integration tool; QA mock provider does not yet model Codex searchable/deferred dynamic tool declarations for this fixture.
|
||||
knownHarnessGap:
|
||||
issue: "#80319"
|
||||
reason: QA mock provider does not yet model Codex searchable/deferred OpenClaw dynamic tool declarations for this fixture.
|
||||
action: hard gate in the standard direct-loading tier
|
||||
reason: session_status is an OpenClaw integration tool and must stay visible and callable under Pi and Codex direct runtime parity.
|
||||
promptSnippet: "target=session_status"
|
||||
failurePromptSnippet: "failure target=session_status"
|
||||
```
|
||||
|
||||
@@ -13,6 +13,7 @@ successCriteria:
|
||||
- Effective tools expose sessions_spawn.
|
||||
- The mock provider plans exactly one happy-path sessions_spawn call.
|
||||
- The mock provider plans one denied-input failure-path sessions_spawn call.
|
||||
- Runtime parity coverage hard-fails call/result drift in the standard direct-loading gate.
|
||||
docsRefs:
|
||||
- qa/scenarios/index.md
|
||||
codeRefs:
|
||||
@@ -28,15 +29,12 @@ execution:
|
||||
actualTool: sessions_spawn
|
||||
bucket: openclaw-dynamic-integration
|
||||
expectedLayer: openclaw-dynamic
|
||||
capabilityLayer: openclaw-dynamic-direct
|
||||
required: true
|
||||
tracking: "#80319"
|
||||
codexDefaultImpact: P4
|
||||
qaImpact: P1
|
||||
action: teach fixture/mock planner Codex searchable OpenClaw dynamic tool behavior
|
||||
reason: sessions_spawn is an OpenClaw integration tool; QA mock provider does not yet model Codex searchable/deferred dynamic tool declarations for this fixture.
|
||||
knownHarnessGap:
|
||||
issue: "#80319"
|
||||
reason: QA mock provider does not yet model Codex searchable/deferred OpenClaw dynamic tool declarations for this fixture.
|
||||
action: hard gate in the standard direct-loading tier
|
||||
reason: sessions_spawn is an OpenClaw integration tool and must stay visible and callable under Pi and Codex direct runtime parity.
|
||||
promptSnippet: "target=sessions_spawn"
|
||||
failurePromptSnippet: "failure target=sessions_spawn"
|
||||
```
|
||||
|
||||
@@ -13,6 +13,7 @@ successCriteria:
|
||||
- Effective tools expose web_fetch.
|
||||
- The mock provider plans exactly one happy-path web_fetch call.
|
||||
- The mock provider plans one denied-input failure-path web_fetch call.
|
||||
- Runtime parity coverage hard-fails call/result drift in the standard direct-loading gate.
|
||||
docsRefs:
|
||||
- qa/scenarios/index.md
|
||||
codeRefs:
|
||||
@@ -28,15 +29,12 @@ execution:
|
||||
actualTool: web_fetch
|
||||
bucket: openclaw-dynamic-integration
|
||||
expectedLayer: openclaw-dynamic
|
||||
capabilityLayer: openclaw-dynamic-direct
|
||||
required: true
|
||||
tracking: "#80319"
|
||||
codexDefaultImpact: P4
|
||||
qaImpact: P1
|
||||
action: teach fixture/mock planner Codex searchable OpenClaw dynamic tool behavior
|
||||
reason: web_fetch is an OpenClaw integration tool; QA mock provider does not yet model Codex searchable/deferred dynamic tool declarations for this fixture.
|
||||
knownHarnessGap:
|
||||
issue: "#80319"
|
||||
reason: QA mock provider does not yet model Codex searchable/deferred OpenClaw dynamic tool declarations for this fixture.
|
||||
action: hard gate in the standard direct-loading tier
|
||||
reason: web_fetch is an OpenClaw integration tool and must stay visible and callable under Pi and Codex direct runtime parity.
|
||||
promptSnippet: "target=web_fetch"
|
||||
failurePromptSnippet: "failure target=web_fetch"
|
||||
```
|
||||
|
||||
@@ -13,6 +13,7 @@ successCriteria:
|
||||
- Effective tools expose web_search.
|
||||
- The mock provider plans exactly one happy-path web_search call.
|
||||
- The mock provider plans one denied-input failure-path web_search call.
|
||||
- Runtime parity coverage hard-fails call/result drift in the standard direct-loading gate.
|
||||
docsRefs:
|
||||
- qa/scenarios/index.md
|
||||
codeRefs:
|
||||
@@ -28,15 +29,12 @@ execution:
|
||||
actualTool: web_search
|
||||
bucket: openclaw-dynamic-integration
|
||||
expectedLayer: openclaw-dynamic
|
||||
capabilityLayer: openclaw-dynamic-direct
|
||||
required: true
|
||||
tracking: "#80319"
|
||||
codexDefaultImpact: P4
|
||||
qaImpact: P1
|
||||
action: teach fixture/mock planner Codex searchable OpenClaw dynamic tool behavior
|
||||
reason: web_search is an OpenClaw integration tool; QA mock provider does not yet model Codex searchable/deferred dynamic tool declarations for this fixture.
|
||||
knownHarnessGap:
|
||||
issue: "#80319"
|
||||
reason: QA mock provider does not yet model Codex searchable/deferred OpenClaw dynamic tool declarations for this fixture.
|
||||
action: hard gate in the standard direct-loading tier
|
||||
reason: web_search is an OpenClaw integration tool and must stay visible and callable under Pi and Codex direct runtime parity.
|
||||
promptSnippet: "target=web_search"
|
||||
failurePromptSnippet: "failure target=web_search"
|
||||
```
|
||||
|
||||
@@ -4,6 +4,8 @@ type Options = {
|
||||
json?: boolean;
|
||||
output?: string;
|
||||
repoRoot?: string;
|
||||
summary?: string;
|
||||
tools?: boolean;
|
||||
};
|
||||
|
||||
function takeValue(args: string[], index: number, flag: string): string {
|
||||
@@ -27,6 +29,8 @@ Options:
|
||||
--json Print machine-readable JSON
|
||||
--output <path> Write the report to a file
|
||||
--repo-root <path> Repository root to target
|
||||
--summary <path> Runtime qa-suite-summary.json to overlay on --tools coverage
|
||||
--tools Print runtime tool fixture coverage instead of scenario coverage
|
||||
-h, --help Display help
|
||||
`);
|
||||
process.exit(0);
|
||||
@@ -41,6 +45,13 @@ Options:
|
||||
opts.repoRoot = takeValue(args, index, arg);
|
||||
index += 1;
|
||||
break;
|
||||
case "--summary":
|
||||
opts.summary = takeValue(args, index, arg);
|
||||
index += 1;
|
||||
break;
|
||||
case "--tools":
|
||||
opts.tools = true;
|
||||
break;
|
||||
default:
|
||||
throw new Error(`Unknown qa coverage option: ${arg}`);
|
||||
}
|
||||
@@ -53,4 +64,6 @@ await runQaCoverageReportCommand({
|
||||
...(opts.json ? { json: true } : {}),
|
||||
...(opts.output ? { output: opts.output } : {}),
|
||||
...(opts.repoRoot ? { repoRoot: opts.repoRoot } : {}),
|
||||
...(opts.summary ? { summary: opts.summary } : {}),
|
||||
...(opts.tools ? { tools: true } : {}),
|
||||
});
|
||||
|
||||
@@ -1093,7 +1093,14 @@ describe("run-node script", () => {
|
||||
|
||||
const exitCode = await runNodeMain({
|
||||
cwd: tmp,
|
||||
args: ["qa", "coverage", "--json"],
|
||||
args: [
|
||||
"qa",
|
||||
"coverage",
|
||||
"--json",
|
||||
"--tools",
|
||||
"--summary",
|
||||
".artifacts/qa-e2e/runtime-parity-standard/qa-suite-summary.json",
|
||||
],
|
||||
env: {
|
||||
...process.env,
|
||||
OPENCLAW_RUNNER_LOG: "0",
|
||||
@@ -1111,6 +1118,9 @@ describe("run-node script", () => {
|
||||
"tsx",
|
||||
path.join(tmp, "scripts", "qa-coverage-report.ts"),
|
||||
"--json",
|
||||
"--tools",
|
||||
"--summary",
|
||||
".artifacts/qa-e2e/runtime-parity-standard/qa-suite-summary.json",
|
||||
],
|
||||
]);
|
||||
});
|
||||
|
||||
@@ -490,6 +490,45 @@ describe("scripts/lib/plugin-prerelease-test-plan.mjs", () => {
|
||||
}
|
||||
});
|
||||
|
||||
// Pins the release-checks workflow wiring for the runtime tool coverage job:
// no continue-on-error, the expected `needs`, the enforcing command, and its
// result being forwarded to the summary job.
it("keeps runtime tool coverage blocking in release checks", () => {
  const releaseChecksSource = readFileSync(".github/workflows/openclaw-release-checks.yml", "utf8");
  const releaseChecksWorkflow = parse(releaseChecksSource);
  const runtimeToolCoverage = releaseChecksWorkflow.jobs.runtime_tool_coverage_release_checks;

  expect(runtimeToolCoverage["continue-on-error"]).toBeUndefined();
  expect(runtimeToolCoverage.needs).toEqual([
    "resolve_target",
    "qa_lab_runtime_parity_release_checks",
  ]);

  // Each fragment must appear in the `run` script of a step named
  // "Enforce standard runtime tool coverage".
  const requiredRunFragments = [
    "pnpm openclaw qa coverage",
    "--summary .artifacts/qa-e2e/runtime-parity-standard/qa-suite-summary.json",
  ];
  for (const fragment of requiredRunFragments) {
    expect(runtimeToolCoverage.steps).toEqual(
      expect.arrayContaining([
        expect.objectContaining({
          name: "Enforce standard runtime tool coverage",
          run: expect.stringContaining(fragment),
        }),
      ]),
    );
  }

  expect(releaseChecksWorkflow.jobs.summary.needs).toContain(
    "runtime_tool_coverage_release_checks",
  );
  expect(releaseChecksSource).toContain(
    '"runtime_tool_coverage_release_checks=${{ needs.runtime_tool_coverage_release_checks.result }}"',
  );
});
|
||||
|
||||
it("keeps the live-ish availability check redacted", () => {
|
||||
const output = execFileSync(
|
||||
process.execPath,
|
||||
|
||||
Reference in New Issue
Block a user