From 21e2168b8f4ab80693364a2f460330bef0ae56f0 Mon Sep 17 00:00:00 2001 From: Peter Steinberger Date: Wed, 29 Apr 2026 22:23:54 +0100 Subject: [PATCH] ci: shard install smoke release checks --- .agents/skills/openclaw-testing/SKILL.md | 13 +- .github/workflows/full-release-validation.yml | 58 ++++- .github/workflows/install-smoke.yml | 208 +++++++++++++++--- .github/workflows/openclaw-release-checks.yml | 2 + docs/ci.md | 2 +- docs/reference/RELEASING.md | 3 + 6 files changed, 248 insertions(+), 38 deletions(-) diff --git a/.agents/skills/openclaw-testing/SKILL.md b/.agents/skills/openclaw-testing/SKILL.md index b674037e202..a40d18d1994 100644 --- a/.agents/skills/openclaw-testing/SKILL.md +++ b/.agents/skills/openclaw-testing/SKILL.md @@ -149,9 +149,9 @@ Use `release_profile=minimum|stable|full` to control live/provider breadth: `minimum` keeps the fastest OpenAI/core release-critical set, `stable` adds the stable provider/backend set, and `full` adds the broad advisory provider/media matrix. Do not make `full` faster by silently dropping suites; optimize setup, -artifact reuse, and sharding instead. The parent verifier job appends -slowest-job tables for child runs; rerun only that verifier after a child rerun -turns green. +artifact reuse, and sharding instead. The parent verifier job appends a child +overview plus slowest-job tables for child runs; rerun only that verifier after +a child rerun turns green. Standalone manual `CI` dispatches do not run the plugin prerelease suite, the extension batch sweep, or the release-only `agentic-plugins` Vitest shard. Those @@ -245,6 +245,13 @@ When `Full Release Validation` dispatches release checks, it passes the requeste branch/tag plus an `expected_sha` so branch/tag refs resolve through the fast remote-ref path while the package and QA jobs still validate the exact SHA. 
+The full install-smoke child is split on purpose: one job prepares or reuses the +target-SHA GHCR root Dockerfile smoke image, QR package install runs in its own +job, root Dockerfile/gateway smokes pull the prepared image, and installer/Bun +smokes pull the same image; the installer job also builds small installer images. +If install-smoke gets slow again, first check whether the root image was reused +or rebuilt before adding/removing coverage. + The full-profile native live media shards use the prebuilt `ghcr.io/openclaw/openclaw-live-media-runner:ubuntu-24.04` container so `ffmpeg`/`ffprobe` are already present. If those jobs suddenly spend minutes in diff --git a/.github/workflows/full-release-validation.yml b/.github/workflows/full-release-validation.yml index e2a7a88af00..c765ddb6522 100644 --- a/.github/workflows/full-release-validation.yml +++ b/.github/workflows/full-release-validation.yml @@ -610,20 +610,64 @@ jobs: return 1 fi - local status conclusion url attempt - status="$(gh run view "$run_id" --json status --jq '.status')" - conclusion="$(gh run view "$run_id" --json conclusion --jq '.conclusion')" - url="$(gh run view "$run_id" --json url --jq '.url')" - attempt="$(gh run view "$run_id" --json attempt --jq '.attempt')" + local run_json status conclusion url attempt + run_json="$(gh run view "$run_id" --json status,conclusion,url,attempt,jobs)" + status="$(jq -r '.status' <<< "$run_json")" + conclusion="$(jq -r '.conclusion' <<< "$run_json")" + url="$(jq -r '.url' <<< "$run_json")" + attempt="$(jq -r '.attempt' <<< "$run_json")" echo "${label}: ${status}/${conclusion} attempt ${attempt}: ${url}" if [[ "$status" != "completed" || "$conclusion" != "success" ]]; then echo "::error::${label} child run ended with ${status}/${conclusion}: ${url}" - gh run view "$run_id" --json jobs --jq '.jobs[] | select(.conclusion != "success" and .conclusion != "skipped") | {name, status, conclusion, url}' || true + jq '.jobs[] | select(.conclusion != "success" and 
.conclusion != "skipped") | {name, status, conclusion, url}' <<< "$run_json" || true return 1 fi } + append_child_overview() { + { + echo + echo "### Child workflow overview" + echo + echo "| Child | Result | Minutes | Run |" + echo "| --- | --- | ---: | --- |" + } >> "$GITHUB_STEP_SUMMARY" + + append_child_row() { + local label="$1" + local run_id="$2" + local result="$3" + + if [[ -z "${run_id// }" ]]; then + echo "| \`${label}\` | \`${result}\` | | skipped |" >> "$GITHUB_STEP_SUMMARY" + return 0 + fi + + local run_json row + run_json="$(gh run view "$run_id" --json status,conclusion,url,createdAt,updatedAt)" + row="$( + jq -r --arg label "$label" ' + def ts: fromdateiso8601; + . as $run | + ($run.createdAt // "") as $created | + ($run.updatedAt // "") as $updated | + (if ($created | length) > 0 and ($updated | length) > 0 + then (((($updated | ts) - ($created | ts)) / 60) * 10 | round / 10 | tostring) + else "" + end) as $minutes | + "| `" + $label + "` | `" + ($run.status // "") + "/" + ($run.conclusion // "") + "` | " + $minutes + " | [run](" + ($run.url // "") + ") |" + ' <<< "$run_json" + )" + echo "$row" >> "$GITHUB_STEP_SUMMARY" + } + + append_child_row "normal_ci" "$NORMAL_CI_RUN_ID" "$NORMAL_CI_RESULT" + append_child_row "plugin_prerelease" "$PLUGIN_PRERELEASE_RUN_ID" "$PLUGIN_PRERELEASE_RESULT" + append_child_row "release_checks" "$RELEASE_CHECKS_RUN_ID" "$RELEASE_CHECKS_RESULT" + append_child_row "npm_telegram" "$NPM_TELEGRAM_RUN_ID" "$NPM_TELEGRAM_RESULT" + } + summarize_child_timing() { local label="$1" local run_id="$2" @@ -675,6 +719,8 @@ jobs: failed=0 + append_child_overview + if [[ "$NORMAL_CI_RESULT" == "skipped" && -z "${NORMAL_CI_RUN_ID// }" ]]; then check_child "normal_ci" "" 0 || failed=1 else diff --git a/.github/workflows/install-smoke.yml b/.github/workflows/install-smoke.yml index 1369b3e294c..1d733a11ff4 100644 --- a/.github/workflows/install-smoke.yml +++ b/.github/workflows/install-smoke.yml @@ -34,10 +34,11 @@ on: permissions: 
contents: read + packages: write concurrency: - group: ${{ github.event_name == 'workflow_dispatch' && format('{0}-manual-{1}', github.workflow, github.run_id) || format('{0}-{1}', github.workflow, github.ref) }} - cancel-in-progress: true + group: ${{ (github.event_name == 'workflow_dispatch' || github.event_name == 'workflow_call') && format('{0}-{1}-{2}', github.workflow, github.event_name, github.run_id) || format('{0}-{1}', github.workflow, github.ref) }} + cancel-in-progress: ${{ github.event_name != 'workflow_call' }} env: FORCE_JAVASCRIPT_ACTIONS_TO_NODE24: "true" @@ -51,6 +52,9 @@ jobs: run_fast_install_smoke: ${{ steps.manifest.outputs.run_fast_install_smoke }} run_full_install_smoke: ${{ steps.manifest.outputs.run_full_install_smoke }} run_bun_global_install_smoke: ${{ steps.manifest.outputs.run_bun_global_install_smoke }} + target_sha: ${{ steps.manifest.outputs.target_sha }} + dockerfile_image: ${{ steps.manifest.outputs.dockerfile_image }} + dockerfile_cache_scope: ${{ steps.manifest.outputs.dockerfile_cache_scope }} steps: - name: Checkout uses: actions/checkout@v6 @@ -74,6 +78,10 @@ jobs: run_full_install_smoke=true run_bun_global_install_smoke=false run_install_smoke=true + target_sha="$(git rev-parse HEAD)" + owner="$(printf '%s' "${GITHUB_REPOSITORY_OWNER:-openclaw}" | tr '[:upper:]' '[:lower:]')" + dockerfile_image="ghcr.io/${owner}/openclaw-dockerfile-smoke:${target_sha}" + dockerfile_cache_scope="openclaw-dockerfile-smoke" if [ "$event_name" = "schedule" ]; then run_bun_global_install_smoke=true elif [ "$event_name" = "workflow_dispatch" ] || [ "$event_name" = "workflow_call" ]; then @@ -87,6 +95,9 @@ jobs: echo "run_fast_install_smoke=$run_fast_install_smoke" echo "run_full_install_smoke=$run_full_install_smoke" echo "run_bun_global_install_smoke=$run_bun_global_install_smoke" + echo "target_sha=$target_sha" + echo "dockerfile_image=$dockerfile_image" + echo "dockerfile_cache_scope=$dockerfile_cache_scope" } >> "$GITHUB_OUTPUT" 
install-smoke-fast: @@ -196,10 +207,12 @@ jobs: " ' - install-smoke: + root_dockerfile_image: needs: [preflight] if: needs.preflight.outputs.run_full_install_smoke == 'true' runs-on: blacksmith-16vcpu-ubuntu-2404 + outputs: + image_ref: ${{ steps.image.outputs.image_ref }} env: DOCKER_BUILD_SUMMARY: "false" DOCKER_BUILD_RECORD_UPLOAD: "false" @@ -209,48 +222,130 @@ jobs: with: ref: ${{ inputs.ref || github.ref }} + - name: Log in to GHCR + uses: docker/login-action@4907a6ddec9925e35a0a9e82d7399ccc52663121 # v4 + with: + registry: ghcr.io + username: ${{ github.actor }} + password: ${{ github.token }} + + - name: Check for existing root Dockerfile smoke image + id: existing + env: + IMAGE_REF: ${{ needs.preflight.outputs.dockerfile_image }} + run: | + set -euo pipefail + if timeout 180s docker pull "$IMAGE_REF"; then + echo "exists=true" >> "$GITHUB_OUTPUT" + echo "Using existing root Dockerfile smoke image: \`$IMAGE_REF\`" >> "$GITHUB_STEP_SUMMARY" + else + echo "exists=false" >> "$GITHUB_OUTPUT" + echo "No existing root Dockerfile smoke image found for \`$IMAGE_REF\`; building it." >> "$GITHUB_STEP_SUMMARY" + fi + - name: Set up Blacksmith Docker Builder + if: steps.existing.outputs.exists != 'true' uses: useblacksmith/setup-docker-builder@ac083cc84672d01c60d5e8561d0a939b697de542 # v1 with: max-cache-size-mb: 800000 + # Build once with the matrix extension and publish by target SHA. Use a + # direct buildx command so release jobs emit Docker progress and time out. + - name: Build and push root Dockerfile smoke image + if: steps.existing.outputs.exists != 'true' + env: + CACHE_SCOPE: ${{ needs.preflight.outputs.dockerfile_cache_scope }} + IMAGE_REF: ${{ needs.preflight.outputs.dockerfile_image }} + run: | + timeout 45m docker buildx build \ + --progress=plain \ + --push \ + --cache-from "type=gha,scope=${CACHE_SCOPE}" \ + --cache-to "type=gha,scope=${CACHE_SCOPE},mode=max" \ + --build-arg OPENCLAW_EXTENSIONS=matrix \ + -t "$IMAGE_REF" \ + -f ./Dockerfile \ + . 
+ + - name: Record root image output + id: image + env: + IMAGE_REF: ${{ needs.preflight.outputs.dockerfile_image }} + run: echo "image_ref=$IMAGE_REF" >> "$GITHUB_OUTPUT" + + - name: Summarize root image + env: + IMAGE_REF: ${{ needs.preflight.outputs.dockerfile_image }} + TARGET_SHA: ${{ needs.preflight.outputs.target_sha }} + run: | + { + echo "## Root Dockerfile smoke image" + echo + echo "- Target SHA: \`${TARGET_SHA}\`" + echo "- Image: \`${IMAGE_REF}\`" + echo "- Reused existing image: \`${{ steps.existing.outputs.exists }}\`" + } >> "$GITHUB_STEP_SUMMARY" + + qr_package_install_smoke: + needs: [preflight] + if: needs.preflight.outputs.run_full_install_smoke == 'true' + runs-on: blacksmith-16vcpu-ubuntu-2404 + steps: + - name: Checkout CLI + uses: actions/checkout@v6 + with: + ref: ${{ inputs.ref || github.ref }} + - name: Run QR package install smoke env: OPENCLAW_QR_SMOKE_FORCE_INSTALL: "1" run: bash scripts/e2e/qr-import-docker.sh - # Build once with the matrix extension and tag both smoke names. Use a - # direct buildx command so release jobs emit Docker progress and time out. - - name: Build root Dockerfile smoke image - run: | - timeout 45m docker buildx build \ - --progress=plain \ - --load \ - --build-arg OPENCLAW_EXTENSIONS=matrix \ - -t openclaw-dockerfile-smoke:local \ - -t openclaw-ext-smoke:local \ - -f ./Dockerfile \ - . 
+ root_dockerfile_smokes: + needs: [preflight, root_dockerfile_image] + if: needs.preflight.outputs.run_full_install_smoke == 'true' + runs-on: blacksmith-16vcpu-ubuntu-2404 + steps: + - name: Checkout CLI + uses: actions/checkout@v6 + with: + ref: ${{ inputs.ref || github.ref }} + + - name: Log in to GHCR + uses: docker/login-action@4907a6ddec9925e35a0a9e82d7399ccc52663121 # v4 + with: + registry: ghcr.io + username: ${{ github.actor }} + password: ${{ github.token }} + + - name: Pull root Dockerfile smoke image + env: + IMAGE_REF: ${{ needs.root_dockerfile_image.outputs.image_ref }} + run: timeout 300s docker pull "$IMAGE_REF" - name: Run root Dockerfile CLI smoke + env: + IMAGE_REF: ${{ needs.root_dockerfile_image.outputs.image_ref }} run: | - docker run --rm --entrypoint sh openclaw-dockerfile-smoke:local -lc 'which openclaw && openclaw --version' + docker run --rm --entrypoint sh "$IMAGE_REF" -lc 'which openclaw && openclaw --version' - name: Run agents delete shared workspace Docker CLI smoke env: - OPENCLAW_AGENTS_DELETE_SHARED_WORKSPACE_E2E_IMAGE: openclaw-dockerfile-smoke:local + OPENCLAW_AGENTS_DELETE_SHARED_WORKSPACE_E2E_IMAGE: ${{ needs.root_dockerfile_image.outputs.image_ref }} OPENCLAW_AGENTS_DELETE_SHARED_WORKSPACE_E2E_SKIP_BUILD: "1" run: bash scripts/e2e/agents-delete-shared-workspace-docker.sh - name: Run Docker gateway network e2e env: - OPENCLAW_GATEWAY_NETWORK_E2E_IMAGE: openclaw-dockerfile-smoke:local + OPENCLAW_GATEWAY_NETWORK_E2E_IMAGE: ${{ needs.root_dockerfile_image.outputs.image_ref }} OPENCLAW_GATEWAY_NETWORK_E2E_SKIP_BUILD: "1" run: bash scripts/e2e/gateway-network-docker.sh - name: Smoke test Dockerfile with matrix extension build arg + env: + IMAGE_REF: ${{ needs.root_dockerfile_image.outputs.image_ref }} run: | - docker run --rm --entrypoint sh openclaw-ext-smoke:local -lc ' + docker run --rm --entrypoint sh "$IMAGE_REF" -lc ' which openclaw && openclaw --version && node -e " @@ -293,6 +388,36 @@ jobs: " ' + installer_smoke: + needs: 
[preflight, root_dockerfile_image] + if: needs.preflight.outputs.run_full_install_smoke == 'true' + runs-on: blacksmith-16vcpu-ubuntu-2404 + env: + DOCKER_BUILD_SUMMARY: "false" + DOCKER_BUILD_RECORD_UPLOAD: "false" + steps: + - name: Checkout CLI + uses: actions/checkout@v6 + with: + ref: ${{ inputs.ref || github.ref }} + + - name: Log in to GHCR + uses: docker/login-action@4907a6ddec9925e35a0a9e82d7399ccc52663121 # v4 + with: + registry: ghcr.io + username: ${{ github.actor }} + password: ${{ github.token }} + + - name: Pull root Dockerfile smoke image + env: + IMAGE_REF: ${{ needs.root_dockerfile_image.outputs.image_ref }} + run: timeout 300s docker pull "$IMAGE_REF" + + - name: Set up Blacksmith Docker Builder + uses: useblacksmith/setup-docker-builder@ac083cc84672d01c60d5e8561d0a939b697de542 # v1 + with: + max-cache-size-mb: 800000 + - name: Build installer smoke image run: | timeout 20m docker buildx build \ @@ -314,16 +439,9 @@ jobs: - name: Setup Node environment for installer smoke uses: ./.github/actions/setup-node-env with: - install-bun: ${{ needs.preflight.outputs.run_bun_global_install_smoke }} + install-bun: "false" install-deps: "true" - - name: Run Bun global install image-provider smoke - if: needs.preflight.outputs.run_bun_global_install_smoke == 'true' - env: - OPENCLAW_BUN_GLOBAL_SMOKE_DIST_IMAGE: openclaw-dockerfile-smoke:local - OPENCLAW_BUN_GLOBAL_SMOKE_HOST_BUILD: "0" - run: bash scripts/e2e/bun-global-install-smoke.sh - - name: Run installer docker tests env: OPENCLAW_INSTALL_URL: https://openclaw.ai/install.sh @@ -336,10 +454,44 @@ jobs: OPENCLAW_INSTALL_SMOKE_SKIP_NPM_GLOBAL: "1" OPENCLAW_INSTALL_SMOKE_SKIP_PREVIOUS: "1" OPENCLAW_INSTALL_SMOKE_UPDATE_BASELINE: ${{ inputs.update_baseline_version || 'latest' }} - OPENCLAW_INSTALL_SMOKE_UPDATE_DIST_IMAGE: openclaw-dockerfile-smoke:local + OPENCLAW_INSTALL_SMOKE_UPDATE_DIST_IMAGE: ${{ needs.root_dockerfile_image.outputs.image_ref }} OPENCLAW_INSTALL_SMOKE_UPDATE_SKIP_LOCAL_BUILD: "1" run: 
bash scripts/test-install-sh-docker.sh + bun_global_install_smoke: + needs: [preflight, root_dockerfile_image] + if: needs.preflight.outputs.run_full_install_smoke == 'true' && needs.preflight.outputs.run_bun_global_install_smoke == 'true' + runs-on: blacksmith-16vcpu-ubuntu-2404 + steps: + - name: Checkout CLI + uses: actions/checkout@v6 + with: + ref: ${{ inputs.ref || github.ref }} + + - name: Log in to GHCR + uses: docker/login-action@4907a6ddec9925e35a0a9e82d7399ccc52663121 # v4 + with: + registry: ghcr.io + username: ${{ github.actor }} + password: ${{ github.token }} + + - name: Pull root Dockerfile smoke image + env: + IMAGE_REF: ${{ needs.root_dockerfile_image.outputs.image_ref }} + run: timeout 300s docker pull "$IMAGE_REF" + + - name: Setup Node environment for Bun smoke + uses: ./.github/actions/setup-node-env + with: + install-bun: "true" + install-deps: "true" + + - name: Run Bun global install image-provider smoke + env: + OPENCLAW_BUN_GLOBAL_SMOKE_DIST_IMAGE: ${{ needs.root_dockerfile_image.outputs.image_ref }} + OPENCLAW_BUN_GLOBAL_SMOKE_HOST_BUILD: "0" + run: bash scripts/e2e/bun-global-install-smoke.sh + docker-e2e-fast: needs: [preflight] if: needs.preflight.outputs.run_fast_install_smoke == 'true' || needs.preflight.outputs.run_full_install_smoke == 'true' diff --git a/.github/workflows/openclaw-release-checks.yml b/.github/workflows/openclaw-release-checks.yml index 7e3cbf0af4f..12e325aa822 100644 --- a/.github/workflows/openclaw-release-checks.yml +++ b/.github/workflows/openclaw-release-checks.yml @@ -245,6 +245,7 @@ jobs: timeout-minutes: 60 permissions: contents: read + packages: write outputs: artifact_name: ${{ steps.artifact.outputs.name }} package_sha256: ${{ steps.package.outputs.sha256 }} @@ -311,6 +312,7 @@ jobs: if: contains(fromJSON('["all","install-smoke"]'), needs.resolve_target.outputs.rerun_group) permissions: contents: read + packages: write uses: ./.github/workflows/install-smoke.yml with: ref: ${{ 
needs.resolve_target.outputs.revision }} diff --git a/docs/ci.md b/docs/ci.md index d1e98e675d1..b55136c6d8c 100644 --- a/docs/ci.md +++ b/docs/ci.md @@ -420,7 +420,7 @@ act as if every scoped area changed. CI workflow edits validate the Node CI graph plus workflow linting, but do not force Windows, Android, or macOS native builds by themselves; those platform lanes stay scoped to platform source changes. CI routing-only edits, selected cheap core-test fixture edits, and narrow plugin contract helper/test-routing edits use a fast Node-only manifest path: preflight, security, and a single `checks-fast-core` task. That path avoids build artifacts, Node 22 compatibility, channel contracts, full core shards, bundled-plugin shards, and additional guard matrices when the changed files are limited to the routing or helper surfaces that the fast task exercises directly. Windows Node checks are scoped to Windows-specific process/path wrappers, npm/pnpm/UI runner helpers, package manager config, and the CI workflow surfaces that execute that lane; unrelated source, plugin, install-smoke, and test-only changes stay on the Linux Node lanes so they do not reserve a 16-vCPU Windows worker for coverage that is already exercised by the normal test shards. -The separate `install-smoke` workflow reuses the same scope script through its own `preflight` job. It splits smoke coverage into `run_fast_install_smoke` and `run_full_install_smoke`. Pull requests run the fast path for Docker/package surfaces, bundled plugin package/manifest changes, and core plugin/channel/gateway/Plugin SDK surfaces that the Docker smoke jobs exercise. Source-only bundled plugin changes, test-only edits, and docs-only edits do not reserve Docker workers. 
The fast path builds the root Dockerfile image once, checks the CLI, runs the agents delete shared-workspace CLI smoke, runs the container gateway-network e2e, verifies a bundled extension build arg, and runs the bounded bundled-plugin Docker profile under a 240-second aggregate command timeout with each scenario's Docker run capped separately. The full path keeps QR package install and installer Docker/update coverage for nightly scheduled runs, manual dispatches, workflow-call release checks, and pull requests that truly touch installer/package/Docker surfaces. `main` pushes, including merge commits, do not force the full path; when changed-scope logic would request full coverage on a push, the workflow keeps the fast Docker smoke and leaves the full install smoke to nightly or release validation. The slow Bun global install image-provider smoke is separately gated by `run_bun_global_install_smoke`; it runs on the nightly schedule and from the release checks workflow, and manual `install-smoke` dispatches can opt into it, but pull requests and `main` pushes do not run it. QR and installer Docker tests keep their own install-focused Dockerfiles. Local `test:docker:all` prebuilds one shared live-test image, packs OpenClaw once as an npm tarball, and builds two shared `scripts/e2e/Dockerfile` images: a bare Node/Git runner for installer/update/plugin-dependency lanes and a functional image that installs the same tarball into `/app` for normal functionality lanes. Docker lane definitions live in `scripts/lib/docker-e2e-scenarios.mjs`, planner logic lives in `scripts/lib/docker-e2e-plan.mjs`, and the runner only executes the selected plan. 
The scheduler selects the image per lane with `OPENCLAW_DOCKER_E2E_BARE_IMAGE` and `OPENCLAW_DOCKER_E2E_FUNCTIONAL_IMAGE`, then runs lanes with `OPENCLAW_SKIP_DOCKER_BUILD=1`; tune the default main-pool slot count of 10 with `OPENCLAW_DOCKER_ALL_PARALLELISM` and the provider-sensitive tail-pool slot count of 10 with `OPENCLAW_DOCKER_ALL_TAIL_PARALLELISM`. Heavy lane caps default to `OPENCLAW_DOCKER_ALL_LIVE_LIMIT=9`, `OPENCLAW_DOCKER_ALL_NPM_LIMIT=10`, and `OPENCLAW_DOCKER_ALL_SERVICE_LIMIT=7` so npm install and multi-service lanes do not overcommit Docker while lighter lanes still fill available slots. A single lane heavier than the effective caps can still start from an empty pool, then runs alone until it releases capacity. Lane starts are staggered by 2 seconds by default to avoid local Docker daemon create storms; override with `OPENCLAW_DOCKER_ALL_START_STAGGER_MS=0` or another millisecond value. The local aggregate preflights Docker, removes stale OpenClaw E2E containers, emits active-lane status, persists lane timings for longest-first ordering, and supports `OPENCLAW_DOCKER_ALL_DRY_RUN=1` for scheduler inspection. It stops scheduling new pooled lanes after the first failure by default, and each lane has a 120-minute fallback timeout overrideable with `OPENCLAW_DOCKER_ALL_LANE_TIMEOUT_MS`; selected live/tail lanes use tighter per-lane caps. `OPENCLAW_DOCKER_ALL_LANES=` runs exact scheduler lanes, including release-only lanes such as `install-e2e` and split bundled update lanes such as `bundled-channel-update-acpx`, while skipping the cleanup smoke so agents can reproduce one failed lane. The reusable live/E2E workflow asks `scripts/test-docker-all.mjs --plan-json` which package, image kind, live image, lane, and credential coverage is required, then `scripts/docker-e2e.mjs` converts that plan into GitHub outputs and summaries. 
It either packs OpenClaw through `scripts/package-openclaw-for-docker.mjs`, downloads a current-run package artifact, or downloads a package artifact from `package_artifact_run_id`; validates the tarball inventory; builds and pushes package-digest-tagged bare/functional GHCR Docker E2E images through Blacksmith's Docker layer cache when the plan needs package-installed lanes; and reuses provided `docker_e2e_bare_image`/`docker_e2e_functional_image` inputs or existing package-digest images instead of rebuilding. Docker image pulls are retried with a bounded 180-second per-attempt timeout so a stuck registry/cache stream retries quickly instead of consuming most of the CI critical path. The `Package Acceptance` workflow is the high-level package gate: it resolves a candidate from npm, a trusted `package_ref`, an HTTPS tarball plus SHA-256, or a prior workflow artifact, then passes that single `package-under-test` artifact into the reusable Docker E2E workflow. It keeps `workflow_ref` separate from `package_ref` so current acceptance logic can validate older trusted commits without checking out old workflow code. Release checks run a custom Package Acceptance delta for the target ref: bundled-channel compat, offline plugin fixtures, and Telegram package QA against the resolved tarball. The release-path Docker suite runs smaller chunked jobs with `OPENCLAW_SKIP_DOCKER_BUILD=1` so each chunk pulls only the image kind it needs and executes multiple lanes through the same weighted scheduler (`OPENCLAW_DOCKER_ALL_PROFILE=release-path`, `OPENCLAW_DOCKER_ALL_CHUNK=core|package-update-openai|package-update-anthropic|package-update-core|plugins-runtime-plugins|plugins-runtime-services|plugins-runtime-install-a|plugins-runtime-install-b|plugins-runtime-install-c|plugins-runtime-install-d|plugins-runtime-install-e|plugins-runtime-install-f|plugins-runtime-install-g|plugins-runtime-install-h|bundled-channels`). 
OpenWebUI is folded into `plugins-runtime-services` when full release-path coverage requests it, and keeps a standalone `openwebui` chunk only for OpenWebUI-only dispatches. The legacy aggregate chunk names `package-update`, `plugins-runtime-core`, `plugins-runtime`, and `plugins-integrations` still work for manual reruns, but the release workflow uses the split chunks so installer E2E and bundled plugin install/uninstall sweeps do not dominate the critical path. The `install-e2e` lane alias remains the aggregate manual rerun alias for both provider installer lanes. The `bundled-channels` chunk runs split `bundled-channel-*` and `bundled-channel-update-*` lanes rather than the serial all-in-one `bundled-channel-deps` lane. Each chunk uploads `.artifacts/docker-tests/` with lane logs, timings, `summary.json`, `failures.json`, phase timings, scheduler plan JSON, slow-lane tables, and per-lane rerun commands. The workflow `docker_lanes` input runs selected lanes against the prepared images instead of the chunk jobs, which keeps failed-lane debugging bounded to one targeted Docker job and prepares, downloads, or reuses the package artifact for that run; if a selected lane is a live Docker lane, the targeted job builds the live-test image locally for that rerun. Generated per-lane GitHub rerun commands include `package_artifact_run_id`, `package_artifact_name`, and prepared image inputs when those values exist, so a failed lane can reuse the exact package and images from the failed run. Use `pnpm test:docker:rerun ` to download Docker artifacts from a GitHub run and print combined/per-lane targeted rerun commands; use `pnpm test:docker:timings ` for slow-lane and phase critical-path summaries. The scheduled live/E2E workflow runs the full release-path Docker suite daily. The bundled update matrix is split by update target so repeated npm update and doctor repair passes can shard with other bundled checks. 
+The separate `install-smoke` workflow reuses the same scope script through its own `preflight` job. It splits smoke coverage into `run_fast_install_smoke` and `run_full_install_smoke`. Pull requests run the fast path for Docker/package surfaces, bundled plugin package/manifest changes, and core plugin/channel/gateway/Plugin SDK surfaces that the Docker smoke jobs exercise. Source-only bundled plugin changes, test-only edits, and docs-only edits do not reserve Docker workers. The fast path builds the root Dockerfile image once, checks the CLI, runs the agents delete shared-workspace CLI smoke, runs the container gateway-network e2e, verifies a bundled extension build arg, and runs the bounded bundled-plugin Docker profile under a 240-second aggregate command timeout with each scenario's Docker run capped separately. The full path keeps QR package install and installer Docker/update coverage for nightly scheduled runs, manual dispatches, workflow-call release checks, and pull requests that truly touch installer/package/Docker surfaces. In full mode, install-smoke prepares or reuses one target-SHA GHCR root Dockerfile smoke image, then runs QR package install, root Dockerfile/gateway smokes, installer/update smokes, and the fast bundled-plugin Docker E2E as separate jobs so installer work does not wait behind the root image smokes. `main` pushes, including merge commits, do not force the full path; when changed-scope logic would request full coverage on a push, the workflow keeps the fast Docker smoke and leaves the full install smoke to nightly or release validation. The slow Bun global install image-provider smoke is separately gated by `run_bun_global_install_smoke`; it runs on the nightly schedule and from the release checks workflow, and manual `install-smoke` dispatches can opt into it, but pull requests and `main` pushes do not run it. QR and installer Docker tests keep their own install-focused Dockerfiles. 
Local `test:docker:all` prebuilds one shared live-test image, packs OpenClaw once as an npm tarball, and builds two shared `scripts/e2e/Dockerfile` images: a bare Node/Git runner for installer/update/plugin-dependency lanes and a functional image that installs the same tarball into `/app` for normal functionality lanes. Docker lane definitions live in `scripts/lib/docker-e2e-scenarios.mjs`, planner logic lives in `scripts/lib/docker-e2e-plan.mjs`, and the runner only executes the selected plan. The scheduler selects the image per lane with `OPENCLAW_DOCKER_E2E_BARE_IMAGE` and `OPENCLAW_DOCKER_E2E_FUNCTIONAL_IMAGE`, then runs lanes with `OPENCLAW_SKIP_DOCKER_BUILD=1`; tune the default main-pool slot count of 10 with `OPENCLAW_DOCKER_ALL_PARALLELISM` and the provider-sensitive tail-pool slot count of 10 with `OPENCLAW_DOCKER_ALL_TAIL_PARALLELISM`. Heavy lane caps default to `OPENCLAW_DOCKER_ALL_LIVE_LIMIT=9`, `OPENCLAW_DOCKER_ALL_NPM_LIMIT=10`, and `OPENCLAW_DOCKER_ALL_SERVICE_LIMIT=7` so npm install and multi-service lanes do not overcommit Docker while lighter lanes still fill available slots. A single lane heavier than the effective caps can still start from an empty pool, then runs alone until it releases capacity. Lane starts are staggered by 2 seconds by default to avoid local Docker daemon create storms; override with `OPENCLAW_DOCKER_ALL_START_STAGGER_MS=0` or another millisecond value. The local aggregate preflights Docker, removes stale OpenClaw E2E containers, emits active-lane status, persists lane timings for longest-first ordering, and supports `OPENCLAW_DOCKER_ALL_DRY_RUN=1` for scheduler inspection. It stops scheduling new pooled lanes after the first failure by default, and each lane has a 120-minute fallback timeout overrideable with `OPENCLAW_DOCKER_ALL_LANE_TIMEOUT_MS`; selected live/tail lanes use tighter per-lane caps. 
`OPENCLAW_DOCKER_ALL_LANES=` runs exact scheduler lanes, including release-only lanes such as `install-e2e` and split bundled update lanes such as `bundled-channel-update-acpx`, while skipping the cleanup smoke so agents can reproduce one failed lane. The reusable live/E2E workflow asks `scripts/test-docker-all.mjs --plan-json` which package, image kind, live image, lane, and credential coverage is required, then `scripts/docker-e2e.mjs` converts that plan into GitHub outputs and summaries. It either packs OpenClaw through `scripts/package-openclaw-for-docker.mjs`, downloads a current-run package artifact, or downloads a package artifact from `package_artifact_run_id`; validates the tarball inventory; builds and pushes package-digest-tagged bare/functional GHCR Docker E2E images through Blacksmith's Docker layer cache when the plan needs package-installed lanes; and reuses provided `docker_e2e_bare_image`/`docker_e2e_functional_image` inputs or existing package-digest images instead of rebuilding. Docker image pulls are retried with a bounded 180-second per-attempt timeout so a stuck registry/cache stream retries quickly instead of consuming most of the CI critical path. The `Package Acceptance` workflow is the high-level package gate: it resolves a candidate from npm, a trusted `package_ref`, an HTTPS tarball plus SHA-256, or a prior workflow artifact, then passes that single `package-under-test` artifact into the reusable Docker E2E workflow. It keeps `workflow_ref` separate from `package_ref` so current acceptance logic can validate older trusted commits without checking out old workflow code. Release checks run a custom Package Acceptance delta for the target ref: bundled-channel compat, offline plugin fixtures, and Telegram package QA against the resolved tarball. 
The release-path Docker suite runs smaller chunked jobs with `OPENCLAW_SKIP_DOCKER_BUILD=1` so each chunk pulls only the image kind it needs and executes multiple lanes through the same weighted scheduler (`OPENCLAW_DOCKER_ALL_PROFILE=release-path`, `OPENCLAW_DOCKER_ALL_CHUNK=core|package-update-openai|package-update-anthropic|package-update-core|plugins-runtime-plugins|plugins-runtime-services|plugins-runtime-install-a|plugins-runtime-install-b|plugins-runtime-install-c|plugins-runtime-install-d|plugins-runtime-install-e|plugins-runtime-install-f|plugins-runtime-install-g|plugins-runtime-install-h|bundled-channels`). OpenWebUI is folded into `plugins-runtime-services` when full release-path coverage requests it, and keeps a standalone `openwebui` chunk only for OpenWebUI-only dispatches. The legacy aggregate chunk names `package-update`, `plugins-runtime-core`, `plugins-runtime`, and `plugins-integrations` still work for manual reruns, but the release workflow uses the split chunks so installer E2E and bundled plugin install/uninstall sweeps do not dominate the critical path. The `install-e2e` lane alias remains the aggregate manual rerun alias for both provider installer lanes. The `bundled-channels` chunk runs split `bundled-channel-*` and `bundled-channel-update-*` lanes rather than the serial all-in-one `bundled-channel-deps` lane. Each chunk uploads `.artifacts/docker-tests/` with lane logs, timings, `summary.json`, `failures.json`, phase timings, scheduler plan JSON, slow-lane tables, and per-lane rerun commands. The workflow `docker_lanes` input runs selected lanes against the prepared images instead of the chunk jobs, which keeps failed-lane debugging bounded to one targeted Docker job and prepares, downloads, or reuses the package artifact for that run; if a selected lane is a live Docker lane, the targeted job builds the live-test image locally for that rerun. 
Generated per-lane GitHub rerun commands include `package_artifact_run_id`, `package_artifact_name`, and prepared image inputs when those values exist, so a failed lane can reuse the exact package and images from the failed run. Use `pnpm test:docker:rerun <run-id>` to download Docker artifacts from a GitHub run and print combined/per-lane targeted rerun commands; use `pnpm test:docker:timings <summary.json>` for slow-lane and phase critical-path summaries. The scheduled live/E2E workflow runs the full release-path Docker suite daily. The bundled update matrix is split by update target so repeated npm update and doctor repair passes can shard with other bundled checks. Current release Docker chunks are `core`, `package-update-openai`, `package-update-anthropic`, `package-update-core`, `plugins-runtime-plugins`, `plugins-runtime-services`, `plugins-runtime-install-a`, `plugins-runtime-install-b`, `plugins-runtime-install-c`, `plugins-runtime-install-d`, `plugins-runtime-install-e`, `plugins-runtime-install-f`, `plugins-runtime-install-g`, `plugins-runtime-install-h`, `bundled-channels-core`, `bundled-channels-update-a`, `bundled-channels-update-discord`, `bundled-channels-update-b`, and `bundled-channels-contracts`. The aggregate `bundled-channels` chunk remains available for manual one-shot reruns, and `plugins-runtime-core`, `plugins-runtime`, and `plugins-integrations` remain aggregate plugin/runtime aliases, but the release workflow uses the split chunks so channel smokes, update targets, plugin runtime checks, and bundled plugin install/uninstall sweeps can run in parallel. Targeted `docker_lanes` dispatches also split multiple selected lanes into parallel jobs after one shared package/image preparation step, and bundled-channel update lanes retry once for transient npm network failures.
diff --git a/docs/reference/RELEASING.md b/docs/reference/RELEASING.md index 4e760d67e22..f876f172c29 100644 --- a/docs/reference/RELEASING.md +++ b/docs/reference/RELEASING.md @@ -352,6 +352,9 @@ Docker environments instead of only source-level tests. Release Docker coverage includes: - full install smoke with the slow Bun global install smoke enabled +- root Dockerfile smoke image preparation/reuse by target SHA, with QR, + root/gateway, and installer/Bun smoke jobs running as separate install-smoke + shards - repository E2E lanes - release-path Docker chunks: `core`, `package-update-openai`, `package-update-anthropic`, `package-update-core`, `plugins-runtime-plugins`,