diff --git a/.github/workflows/parity-gate.yml b/.github/workflows/parity-gate.yml deleted file mode 100644 index b8cf6cc137e..00000000000 --- a/.github/workflows/parity-gate.yml +++ /dev/null @@ -1,109 +0,0 @@ -name: Parity gate - -on: - schedule: - - cron: "17 3 * * *" - release: - types: [published] - workflow_dispatch: - -permissions: - contents: read - -concurrency: - group: parity-gate-${{ github.event.pull_request.number || github.sha }} - cancel-in-progress: true - -jobs: - parity-gate: - name: Run the OpenAI / Opus 4.6 parity gate against the qa-lab mock - runs-on: blacksmith-32vcpu-ubuntu-2404 - timeout-minutes: 30 - env: - # Fence the gate off from any real provider credentials. The qa-lab - # mock server + auth staging (PR N) should be enough to produce a - # meaningful verdict without touching a real API. If any of these - # leak into the job env, fail hard instead of silently running - # against a live provider and burning real budget. - # - # The parity pack has 11 isolated scenario workers. It exercises a real - # gateway child plus mock model turns and subagents, so keep it serial in - # CI even on the larger runner. Concurrent isolated gateway workers make - # the short strict-agentic scenarios flaky, especially the approval-turn - # followthrough gate that expects a fast post-approval read within a 30s - # agent.wait timeout. - QA_PARITY_CONCURRENCY: "1" - OPENCLAW_CI_OPENAI_MODEL: ${{ vars.OPENCLAW_CI_OPENAI_MODEL || 'openai/gpt-5.5' }} - OPENCLAW_QA_TRANSPORT_READY_TIMEOUT_MS: "180000" - OPENAI_API_KEY: "" - ANTHROPIC_API_KEY: "" - OPENCLAW_LIVE_OPENAI_KEY: "" - OPENCLAW_LIVE_ANTHROPIC_KEY: "" - OPENCLAW_LIVE_GEMINI_KEY: "" - OPENCLAW_LIVE_SETUP_TOKEN_VALUE: "" - # The parity suite is a private QA command. Build that exact runtime up - # front so CI never tests a public dist plus a later no-clean QA overlay. - OPENCLAW_BUILD_PRIVATE_QA: "1" - OPENCLAW_ENABLE_PRIVATE_QA_CLI: "1" - steps: - - name: Checkout PR - uses: actions/checkout@v6 - with: - persist-credentials: false - - - name: Install pnpm - uses: pnpm/action-setup@b906affcce14559ad1aafd4ab0e942779e9f58b1 - - - name: Setup Node - uses: actions/setup-node@v6 - with: - node-version: "22.18.0" - cache: "pnpm" - - - name: Install dependencies - run: pnpm install --frozen-lockfile - - - name: Build private QA runtime - run: pnpm build - - # The approval-turn sentinel still runs inside the full parity pack below. - # Keep the exact mock read-plan contract in deterministic unit tests instead - # of paying for a separate full-runtime preflight that has been flaky in CI. - - name: Run OpenAI candidate lane - run: | - pnpm openclaw qa suite \ - --provider-mode mock-openai \ - --parity-pack agentic \ - --concurrency "${QA_PARITY_CONCURRENCY}" \ - --model "${OPENCLAW_CI_OPENAI_MODEL}" \ - --alt-model openai/gpt-5.4-alt \ - --output-dir .artifacts/qa-e2e/gpt54 - - - name: Run Opus 4.6 lane - run: | - pnpm openclaw qa suite \ - --provider-mode mock-openai \ - --parity-pack agentic \ - --concurrency "${QA_PARITY_CONCURRENCY}" \ - --model anthropic/claude-opus-4-6 \ - --alt-model anthropic/claude-sonnet-4-6 \ - --output-dir .artifacts/qa-e2e/opus46 - - - name: Generate parity report - run: | - pnpm openclaw qa parity-report \ - --repo-root . \ - --candidate-summary .artifacts/qa-e2e/gpt54/qa-suite-summary.json \ - --baseline-summary .artifacts/qa-e2e/opus46/qa-suite-summary.json \ - --candidate-label "${OPENCLAW_CI_OPENAI_MODEL}" \ - --baseline-label anthropic/claude-opus-4-6 \ - --output-dir .artifacts/qa-e2e/parity - - - name: Upload parity artifacts - if: always() - uses: actions/upload-artifact@v4 - with: - name: parity-gate-${{ github.event.pull_request.number || github.sha }} - path: .artifacts/qa-e2e/ - retention-days: 14 - if-no-files-found: warn diff --git a/.github/workflows/qa-live-transports-convex.yml b/.github/workflows/qa-live-transports-convex.yml index 4827954fe5a..b5808910568 100644 --- a/.github/workflows/qa-live-transports-convex.yml +++ b/.github/workflows/qa-live-transports-convex.yml @@ -141,7 +141,7 @@ jobs: } >> "$GITHUB_STEP_SUMMARY" run_mock_parity: - name: Run QA Lab parity gate + name: Run QA Lab mock parity lane needs: [validate_selected_ref] runs-on: blacksmith-8vcpu-ubuntu-2404 timeout-minutes: 30 diff --git a/docs/ci.md b/docs/ci.md index dd587467bb3..50cd661b98a 100644 --- a/docs/ci.md +++ b/docs/ci.md @@ -383,10 +383,9 @@ The scheduled live/E2E workflow runs the full release-path Docker suite daily. ## QA Lab -QA Lab has dedicated CI lanes outside the main smart-scoped workflow. +QA Lab has dedicated CI lanes outside the main smart-scoped workflow. Agentic parity is nested under the broad QA and release harnesses, not a standalone PR workflow. Use `Full Release Validation` with `rerun_group=qa-parity` when parity should ride with a broad validation run. -- The `Parity gate` workflow runs on matching PR changes and manual dispatch; it builds the private QA runtime and compares the mock GPT-5.5 and Opus 4.6 agentic packs. -- The `QA-Lab - All Lanes` workflow runs nightly on `main` and on manual dispatch; it fans out the mock parity gate, live Matrix lane, and live Telegram and Discord lanes as parallel jobs. Live jobs use the `qa-live-shared` environment, and Telegram/Discord use Convex leases. +- The `QA-Lab - All Lanes` workflow runs nightly on `main` and on manual dispatch; it fans out the mock parity lane, live Matrix lane, and live Telegram and Discord lanes as parallel jobs. Live jobs use the `qa-live-shared` environment, and Telegram/Discord use Convex leases. Release checks run Matrix and Telegram live transport lanes with the deterministic mock provider and mock-qualified models (`mock-openai/gpt-5.5` and `mock-openai/gpt-5.5-alt`) so the channel contract is isolated from live model latency and normal provider-plugin startup. The live transport gateway disables memory search because QA parity covers memory behavior separately; provider connectivity is covered by the separate live model, native provider, and Docker provider suites. @@ -394,7 +393,7 @@ Matrix uses `--profile fast` for scheduled and release gates, adding `--fail-fas `OpenClaw Release Checks` also runs the release-critical QA Lab lanes before release approval; its QA parity gate runs the candidate and baseline packs as parallel lane jobs, then downloads both artifacts into a small report job for the final parity comparison. -Do not put the PR landing path behind `Parity gate` unless the change actually touches QA runtime, model-pack parity, or a surface the parity workflow owns. For normal channel, config, docs, or unit-test fixes, treat it as an optional signal and follow the scoped CI/check evidence instead. +For normal PRs, follow scoped CI/check evidence instead of treating parity as a required status. ## CodeQL diff --git a/docs/concepts/qa-e2e-automation.md b/docs/concepts/qa-e2e-automation.md index 36c3e9065ad..8e8b1112223 100644 --- a/docs/concepts/qa-e2e-automation.md +++ b/docs/concepts/qa-e2e-automation.md @@ -32,7 +32,7 @@ script aliases; both forms are supported. | `qa run` | Bundled QA self-check; writes a Markdown report. | | `qa suite` | Run repo-backed scenarios against the QA gateway lane. Aliases: `pnpm openclaw qa suite --runner multipass` for a disposable Linux VM. | | `qa coverage` | Print the markdown scenario-coverage inventory (`--json` for machine output). | -| `qa parity-report` | Compare two `qa-suite-summary.json` files and write the agentic parity-gate report. | +| `qa parity-report` | Compare two `qa-suite-summary.json` files and write the agentic parity report. | | `qa character-eval` | Run the character QA scenario across multiple live models with a judged report. See [Reporting](#reporting). | | `qa manual` | Run a one-off prompt against the selected provider/model lane. | | `qa ui` | Start the QA debugger UI and local QA bus (alias: `pnpm qa:lab:ui`). | diff --git a/docs/help/testing.md b/docs/help/testing.md index 4a0a8e971cf..ad72328b404 100644 --- a/docs/help/testing.md +++ b/docs/help/testing.md @@ -109,18 +109,21 @@ When you only need one failing case, prefer narrowing live tests via the allowli These commands sit beside the main test suites when you need QA-lab realism: -CI runs QA Lab in dedicated workflows. `Parity gate` runs on matching PRs and -from manual dispatch with mock providers. `QA-Lab - All Lanes` runs nightly on -`main` and from manual dispatch with the mock parity gate, live Matrix lane, -Convex-managed live Telegram lane, and Convex-managed live Discord lane as -parallel jobs. Scheduled QA and release checks pass Matrix `--profile fast` -explicitly, while the Matrix CLI and manual workflow input default remain -`all`; manual dispatch can shard `all` into `transport`, `media`, `e2ee-smoke`, -`e2ee-deep`, and `e2ee-cli` jobs. `OpenClaw Release Checks` runs parity plus -the fast Matrix and Telegram lanes before release approval, using -`mock-openai/gpt-5.5` for release transport checks so they stay deterministic -and avoid normal provider-plugin startup. These live transport gateways disable -memory search; memory behavior stays covered by the QA parity suites. +CI runs QA Lab in dedicated workflows. Agentic parity is nested under +`QA-Lab - All Lanes` and release validation, not a standalone PR workflow. +Broad validation should use `Full Release Validation` with +`rerun_group=qa-parity` or the release-checks QA group. `QA-Lab - All Lanes` +runs nightly on `main` and from manual dispatch with the mock parity lane, live +Matrix lane, Convex-managed live Telegram lane, and Convex-managed live Discord +lane as parallel jobs. Scheduled QA and release checks pass Matrix +`--profile fast` explicitly, while the Matrix CLI and manual workflow input +default remain `all`; manual dispatch can shard `all` into `transport`, +`media`, `e2ee-smoke`, `e2ee-deep`, and `e2ee-cli` jobs. `OpenClaw Release +Checks` runs parity plus the fast Matrix and Telegram lanes before release +approval, using `mock-openai/gpt-5.5` for release transport checks so they stay +deterministic and avoid normal provider-plugin startup. These live transport +gateways disable memory search; memory behavior stays covered by the QA parity +suites. Full release live media shards use `ghcr.io/openclaw/openclaw-live-media-runner:ubuntu-24.04`, which already has diff --git a/docs/reference/RELEASING.md b/docs/reference/RELEASING.md index c9f0cebec9e..118b05e1df0 100644 --- a/docs/reference/RELEASING.md +++ b/docs/reference/RELEASING.md @@ -173,7 +173,7 @@ the maintainer-only release runbook. plugins. - Release checks now run in a separate manual workflow: `OpenClaw Release Checks` -- `OpenClaw Release Checks` also runs the QA Lab mock parity gate plus the fast +- `OpenClaw Release Checks` also runs the QA Lab mock parity lane plus the fast live Matrix profile and Telegram QA lane before release approval. The live lanes use the `qa-live-shared` environment; Telegram also uses Convex CI credential leases. Run the manual `QA-Lab - All Lanes` workflow with @@ -443,7 +443,7 @@ package mechanics. Release QA Lab coverage includes: -- mock parity gate comparing the OpenAI candidate lane against the Opus 4.6 +- mock parity lane comparing the OpenAI candidate lane against the Opus 4.6 baseline using the agentic parity pack - fast live Matrix QA profile using the `qa-live-shared` environment - live Telegram QA lane using Convex CI credential leases diff --git a/scripts/check-docker-e2e-boundaries.mjs b/scripts/check-docker-e2e-boundaries.mjs index 99ef52d0d06..268a9dab3da 100644 --- a/scripts/check-docker-e2e-boundaries.mjs +++ b/scripts/check-docker-e2e-boundaries.mjs @@ -58,6 +58,7 @@ function validateUniqueLanes(label, lanes) { } function validateLane(label, lane) { + const resources = laneResources(lane); if (!lane.name || typeof lane.name !== "string") { errors.push(`${label}: Docker E2E lane is missing a string name`); } @@ -70,7 +71,7 @@ function validateLane(label, lane) { `${label}: Docker E2E lane '${lane.name}' has invalid image kind '${lane.e2eImageKind}'`, ); } - if (lane.live && lane.e2eImageKind) { + if (lane.live && lane.e2eImageKind && !resources.includes("npm")) { errors.push(`${label}: live Docker E2E lane '${lane.name}' must not require a package image`); } if (!lane.live && !lane.e2eImageKind) { @@ -79,7 +80,7 @@ function validateLane(label, lane) { if (laneWeight(lane) < 1) { errors.push(`${label}: Docker E2E lane '${lane.name}' must have positive weight`); } - if (!laneResources(lane).includes("docker")) { + if (!resources.includes("docker")) { errors.push(`${label}: Docker E2E lane '${lane.name}' must include the docker resource`); }