From ffa84cdc025b3396d5044e322fbe6ff27cbeaf22 Mon Sep 17 00:00:00 2001
From: Peter Steinberger
Date: Sun, 26 Apr 2026 21:23:04 +0100
Subject: [PATCH] ci: chunk release Docker e2e jobs

---
# Review notes (kept between the "---" separator and the diffstat, i.e.
# outside the commit message):
#
# - validate_docker_e2e: the 16 per-suite matrix entries are replaced by
#   three chunk entries (core: 120 min, package-update: 180 min,
#   plugins-integrations: 180 min). The chunk id reaches the scripts via
#   the new DOCKER_E2E_CHUNK env var instead of matrix.suite_id.
# - The "Configure suite-specific env" step is replaced by an explicit
#   `docker pull` of OPENCLAW_DOCKER_E2E_IMAGE. OPENCLAW_E2E_MODELS=both is
#   now passed inline (`env ... pnpm test:install:e2e`) on the installer
#   lane only, rather than exported through GITHUB_ENV.
# - run_lane runs one lane inside a log group, records a non-zero exit in
#   `failures` and returns, so the remaining lanes of the chunk still run;
#   the step exits 1 at the end if any lane failed. An unknown chunk id
#   exits 1 immediately.
# - Open WebUI runs as the last lane of plugins-integrations when
#   INCLUDE_OPENWEBUI is "true"; the standalone validate_docker_openwebui
#   job now runs only when include_release_path_suites is false.
# - NOTE(review): line breaks and leading indentation in the hunks below
#   were reconstructed from a whitespace-collapsed copy. Hunk line counts
#   match the @@ headers, but the exact leading whitespace is assumed from
#   standard workflow nesting -- confirm against the target file before
#   applying.
 .../openclaw-live-and-e2e-checks-reusable.yml | 180 +++++++++---------
 docs/ci.md | 2 +-
 2 files changed, 90 insertions(+), 92 deletions(-)

diff --git a/.github/workflows/openclaw-live-and-e2e-checks-reusable.yml b/.github/workflows/openclaw-live-and-e2e-checks-reusable.yml
index cbe4ae1a639..712b0aee5be 100644
--- a/.github/workflows/openclaw-live-and-e2e-checks-reusable.yml
+++ b/.github/workflows/openclaw-live-and-e2e-checks-reusable.yml
@@ -364,92 +364,22 @@ jobs:
   validate_docker_e2e:
     needs: [validate_selected_ref, prepare_docker_e2e_image]
     if: inputs.include_release_path_suites
+    name: Docker E2E (${{ matrix.label }})
     runs-on: blacksmith-32vcpu-ubuntu-2404
     timeout-minutes: ${{ matrix.timeout_minutes }}
     strategy:
       fail-fast: false
       matrix:
         include:
-          - suite_id: docker-onboard
-            label: Onboarding Docker E2E
-            command: pnpm test:docker:onboard
-            timeout_minutes: 60
-            release_path: true
-          - suite_id: docker-npm-onboard-channel-agent
-            label: Npm Onboard Channel Agent Docker E2E
-            command: pnpm test:docker:npm-onboard-channel-agent
-            timeout_minutes: 90
-            release_path: true
-          - suite_id: docker-gateway-network
-            label: Gateway Network Docker E2E
-            command: pnpm test:docker:gateway-network
-            timeout_minutes: 60
-            release_path: true
-          - suite_id: docker-openai-web-search-minimal
-            label: OpenAI Web Search Minimal Docker E2E
-            command: pnpm test:docker:openai-web-search-minimal
-            timeout_minutes: 60
-            release_path: true
-          - suite_id: docker-mcp-channels
-            label: MCP Channels Docker E2E
-            command: pnpm test:docker:mcp-channels
-            timeout_minutes: 60
-            release_path: true
-          - suite_id: docker-pi-bundle-mcp-tools
-            label: Pi Bundle MCP Tools Docker E2E
-            command: pnpm test:docker:pi-bundle-mcp-tools
-            timeout_minutes: 60
-            release_path: true
-          - suite_id: docker-cron-mcp-cleanup
-            label: Cron MCP Cleanup Docker E2E
-            command: pnpm test:docker:cron-mcp-cleanup
-            timeout_minutes: 60
-            release_path: true
-          - suite_id: docker-plugins
-            label: Plugins Docker E2E
-            command: pnpm test:docker:plugins
-            timeout_minutes: 75
-            release_path: true
-          - suite_id: docker-plugin-update
-            label: Plugin Update Docker E2E
-            command: pnpm test:docker:plugin-update
-            timeout_minutes: 60
-            release_path: true
-          - suite_id: docker-config-reload
-            label: Config Reload Docker E2E
-            command: pnpm test:docker:config-reload
-            timeout_minutes: 60
-            release_path: true
-          - suite_id: docker-bundled-channel-deps
-            label: Bundled Channel Runtime Deps Docker E2E
-            command: pnpm test:docker:bundled-channel-deps
-            timeout_minutes: 75
-            release_path: true
-          - suite_id: docker-doctor-switch
-            label: Doctor Install Switch Docker E2E
-            command: pnpm test:docker:doctor-switch
-            timeout_minutes: 60
-            release_path: true
-          - suite_id: docker-update-channel-switch
-            label: Update Channel Switch Docker E2E
-            command: pnpm test:docker:update-channel-switch
-            timeout_minutes: 60
-            release_path: true
-          - suite_id: docker-session-runtime-context
-            label: Session Runtime Context Docker E2E
-            command: pnpm test:docker:session-runtime-context
-            timeout_minutes: 60
-            release_path: true
-          - suite_id: docker-qr
-            label: QR Import Docker E2E
-            command: pnpm test:docker:qr
-            timeout_minutes: 60
-            release_path: true
-          - suite_id: docker-install-e2e
-            label: Installer Docker E2E
-            command: pnpm test:install:e2e
+          - chunk_id: core
+            label: core
             timeout_minutes: 120
-            release_path: true
+          - chunk_id: package-update
+            label: package/update
+            timeout_minutes: 180
+          - chunk_id: plugins-integrations
+            label: plugins/integrations
+            timeout_minutes: 180
     env:
       OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
       OPENAI_BASE_URL: ${{ secrets.OPENAI_BASE_URL }}
@@ -497,6 +427,8 @@ jobs:
       FIREWORKS_API_KEY: ${{ secrets.FIREWORKS_API_KEY }}
       OPENCLAW_DOCKER_E2E_IMAGE: ${{ needs.prepare_docker_e2e_image.outputs.image }}
       OPENCLAW_SKIP_DOCKER_BUILD: "1"
+      INCLUDE_OPENWEBUI: ${{ inputs.include_openwebui }}
+      DOCKER_E2E_CHUNK: ${{ matrix.chunk_id }}
     steps:
       - name: Checkout selected ref
         uses: actions/checkout@v6
@@ -521,22 +453,18 @@ jobs:
       - name: Hydrate live auth/profile inputs
         run: bash scripts/ci-hydrate-live-auth.sh
 
-      - name: Configure suite-specific env
+      - name: Pull shared Docker E2E image
         shell: bash
         run: |
           set -euo pipefail
-          case "${{ matrix.suite_id }}" in
-            docker-install-e2e)
-              echo "OPENCLAW_E2E_MODELS=both" >> "$GITHUB_ENV"
-              ;;
-          esac
+          docker pull "${OPENCLAW_DOCKER_E2E_IMAGE}"
 
-      - name: Validate suite credentials
+      - name: Validate chunk credentials
         shell: bash
         run: |
           set -euo pipefail
-          case "${{ matrix.suite_id }}" in
-            docker-install-e2e)
+          case "${DOCKER_E2E_CHUNK}" in
+            package-update)
               [[ -n "${OPENAI_API_KEY:-}" ]] || {
                 echo "OPENAI_API_KEY is required for installer Docker E2E." >&2
                 exit 1
@@ -546,14 +474,84 @@ jobs:
                 exit 1
               fi
               ;;
+            plugins-integrations)
+              if [[ "${INCLUDE_OPENWEBUI}" == "true" ]]; then
+                [[ -n "${OPENAI_API_KEY:-}" ]] || {
+                  echo "OPENAI_API_KEY is required for the Open WebUI Docker smoke." >&2
+                  exit 1
+                }
+              fi
+              ;;
           esac
 
-      - name: Run ${{ matrix.label }}
-        run: ${{ matrix.command }}
+      - name: Run Docker E2E chunk
+        shell: bash
+        run: |
+          set -euo pipefail
+
+          failures=()
+
+          run_lane() {
+            local label="$1"
+            shift
+
+            echo "::group::${label}"
+            local status=0
+            "$@" || status=$?
+            echo "::endgroup::"
+
+            if [[ "$status" -ne 0 ]]; then
+              failures+=("${label} exited ${status}")
+            fi
+          }
+
+          run_openwebui_lane() {
+            if [[ "${INCLUDE_OPENWEBUI}" != "true" ]]; then
+              echo "Skipping Open WebUI Docker E2E because include_openwebui=false."
+              return 0
+            fi
+            run_lane "Open WebUI Docker E2E" pnpm test:docker:openwebui
+          }
+
+          case "${DOCKER_E2E_CHUNK}" in
+            core)
+              run_lane "QR Import Docker E2E" pnpm test:docker:qr
+              run_lane "Onboarding Docker E2E" pnpm test:docker:onboard
+              run_lane "Gateway Network Docker E2E" pnpm test:docker:gateway-network
+              run_lane "Config Reload Docker E2E" pnpm test:docker:config-reload
+              run_lane "Session Runtime Context Docker E2E" pnpm test:docker:session-runtime-context
+              run_lane "Pi Bundle MCP Tools Docker E2E" pnpm test:docker:pi-bundle-mcp-tools
+              run_lane "MCP Channels Docker E2E" pnpm test:docker:mcp-channels
+              ;;
+            package-update)
+              run_lane "Installer Docker E2E" env OPENCLAW_E2E_MODELS=both pnpm test:install:e2e
+              run_lane "Npm Onboard Channel Agent Docker E2E" pnpm test:docker:npm-onboard-channel-agent
+              run_lane "Doctor Install Switch Docker E2E" pnpm test:docker:doctor-switch
+              run_lane "Update Channel Switch Docker E2E" pnpm test:docker:update-channel-switch
+              ;;
+            plugins-integrations)
+              run_lane "Plugins Docker E2E" pnpm test:docker:plugins
+              run_lane "Plugin Update Docker E2E" pnpm test:docker:plugin-update
+              run_lane "Bundled Channel Runtime Deps Docker E2E" pnpm test:docker:bundled-channel-deps
+              run_lane "Cron MCP Cleanup Docker E2E" pnpm test:docker:cron-mcp-cleanup
+              run_lane "OpenAI Web Search Minimal Docker E2E" pnpm test:docker:openai-web-search-minimal
+              run_openwebui_lane
+              ;;
+            *)
+              echo "Unknown Docker E2E chunk: ${DOCKER_E2E_CHUNK}" >&2
+              exit 1
+              ;;
+          esac
+
+          if (( ${#failures[@]} > 0 )); then
+            printf 'Docker E2E chunk %s failed:\n' "${DOCKER_E2E_CHUNK}" >&2
+            printf -- '- %s\n' "${failures[@]}" >&2
+            exit 1
+          fi
 
   validate_docker_openwebui:
     needs: [validate_selected_ref, prepare_docker_e2e_image]
-    if: inputs.include_openwebui
+    if: inputs.include_openwebui && !inputs.include_release_path_suites
     runs-on: blacksmith-32vcpu-ubuntu-2404
     timeout-minutes: 75
     env:
diff --git a/docs/ci.md b/docs/ci.md
index d458b7a1ec0..a3f7fcbc18a 100644
---
a/docs/ci.md +++ b/docs/ci.md @@ -92,7 +92,7 @@ Scope logic lives in `scripts/ci-changed-scope.mjs` and is covered by unit tests CI workflow edits validate the Node CI graph plus workflow linting, but do not force Windows, Android, or macOS native builds by themselves; those platform lanes stay scoped to platform source changes. CI routing-only edits, selected cheap core-test fixture edits, and narrow plugin contract helper/test-routing edits use a fast Node-only manifest path: preflight, security, and a single `checks-fast-core` task. That path avoids build artifacts, Node 22 compatibility, channel contracts, full core shards, bundled-plugin shards, and additional guard matrices when the changed files are limited to the routing or helper surfaces that the fast task exercises directly. Windows Node checks are scoped to Windows-specific process/path wrappers, npm/pnpm/UI runner helpers, package manager config, and the CI workflow surfaces that execute that lane; unrelated source, plugin, install-smoke, and test-only changes stay on the Linux Node lanes so they do not reserve a 16-vCPU Windows worker for coverage that is already exercised by the normal test shards. -The separate `install-smoke` workflow reuses the same scope script through its own `preflight` job. It splits smoke coverage into `run_fast_install_smoke` and `run_full_install_smoke`. Pull requests run the fast path for Docker/package surfaces, bundled plugin package/manifest changes, and core plugin/channel/gateway/Plugin SDK surfaces that the Docker smoke jobs exercise. Source-only bundled plugin changes, test-only edits, and docs-only edits do not reserve Docker workers. 
The fast path builds the root Dockerfile image once, checks the CLI, runs the agents delete shared-workspace CLI smoke, runs the container gateway-network e2e, verifies a bundled extension build arg, and runs the bounded bundled-plugin Docker profile under a 240-second aggregate command timeout with each scenario's Docker run capped separately. The full path keeps QR package install and installer Docker/update coverage for nightly scheduled runs, manual dispatches, workflow-call release checks, and pull requests that truly touch installer/package/Docker surfaces. `main` pushes, including merge commits, do not force the full path; when changed-scope logic would request full coverage on a push, the workflow keeps the fast Docker smoke and leaves the full install smoke to nightly or release validation. The slow Bun global install image-provider smoke is separately gated by `run_bun_global_install_smoke`; it runs on the nightly schedule and from the release checks workflow, and manual `install-smoke` dispatches can opt into it, but pull requests and `main` pushes do not run it. QR and installer Docker tests keep their own install-focused Dockerfiles. Local `test:docker:all` prebuilds one shared live-test image and one shared `scripts/e2e/Dockerfile` built-app image, then runs the live/E2E smoke lanes with a weighted scheduler and `OPENCLAW_SKIP_DOCKER_BUILD=1`; tune the default main-pool slot count of 10 with `OPENCLAW_DOCKER_ALL_PARALLELISM` and the provider-sensitive tail-pool slot count of 10 with `OPENCLAW_DOCKER_ALL_TAIL_PARALLELISM`. Heavy lane caps default to `OPENCLAW_DOCKER_ALL_LIVE_LIMIT=6`, `OPENCLAW_DOCKER_ALL_NPM_LIMIT=8`, and `OPENCLAW_DOCKER_ALL_SERVICE_LIMIT=7` so npm install and multi-service lanes do not overcommit Docker while lighter lanes still fill available slots. Lane starts are staggered by 2 seconds by default to avoid local Docker daemon create storms; override with `OPENCLAW_DOCKER_ALL_START_STAGGER_MS=0` or another millisecond value. 
The local aggregate preflights Docker, removes stale OpenClaw E2E containers, emits active-lane status, persists lane timings for longest-first ordering, and supports `OPENCLAW_DOCKER_ALL_DRY_RUN=1` for scheduler inspection. It stops scheduling new pooled lanes after the first failure by default, and each lane has a 120-minute fallback timeout overrideable with `OPENCLAW_DOCKER_ALL_LANE_TIMEOUT_MS`; selected live/tail lanes use tighter per-lane caps. The reusable live/E2E workflow mirrors the shared-image pattern by building and pushing one SHA-tagged GHCR Docker E2E image before the Docker matrix, then running the matrix with `OPENCLAW_SKIP_DOCKER_BUILD=1`. The scheduled live/E2E workflow runs the full release-path Docker suite daily. The bundled update matrix is split by update target so repeated npm update and doctor repair passes can shard with other bundled checks. +The separate `install-smoke` workflow reuses the same scope script through its own `preflight` job. It splits smoke coverage into `run_fast_install_smoke` and `run_full_install_smoke`. Pull requests run the fast path for Docker/package surfaces, bundled plugin package/manifest changes, and core plugin/channel/gateway/Plugin SDK surfaces that the Docker smoke jobs exercise. Source-only bundled plugin changes, test-only edits, and docs-only edits do not reserve Docker workers. The fast path builds the root Dockerfile image once, checks the CLI, runs the agents delete shared-workspace CLI smoke, runs the container gateway-network e2e, verifies a bundled extension build arg, and runs the bounded bundled-plugin Docker profile under a 240-second aggregate command timeout with each scenario's Docker run capped separately. The full path keeps QR package install and installer Docker/update coverage for nightly scheduled runs, manual dispatches, workflow-call release checks, and pull requests that truly touch installer/package/Docker surfaces. 
`main` pushes, including merge commits, do not force the full path; when changed-scope logic would request full coverage on a push, the workflow keeps the fast Docker smoke and leaves the full install smoke to nightly or release validation. The slow Bun global install image-provider smoke is separately gated by `run_bun_global_install_smoke`; it runs on the nightly schedule and from the release checks workflow, and manual `install-smoke` dispatches can opt into it, but pull requests and `main` pushes do not run it. QR and installer Docker tests keep their own install-focused Dockerfiles. Local `test:docker:all` prebuilds one shared live-test image and one shared `scripts/e2e/Dockerfile` built-app image, then runs the live/E2E smoke lanes with a weighted scheduler and `OPENCLAW_SKIP_DOCKER_BUILD=1`; tune the default main-pool slot count of 10 with `OPENCLAW_DOCKER_ALL_PARALLELISM` and the provider-sensitive tail-pool slot count of 10 with `OPENCLAW_DOCKER_ALL_TAIL_PARALLELISM`. Heavy lane caps default to `OPENCLAW_DOCKER_ALL_LIVE_LIMIT=6`, `OPENCLAW_DOCKER_ALL_NPM_LIMIT=8`, and `OPENCLAW_DOCKER_ALL_SERVICE_LIMIT=7` so npm install and multi-service lanes do not overcommit Docker while lighter lanes still fill available slots. Lane starts are staggered by 2 seconds by default to avoid local Docker daemon create storms; override with `OPENCLAW_DOCKER_ALL_START_STAGGER_MS=0` or another millisecond value. The local aggregate preflights Docker, removes stale OpenClaw E2E containers, emits active-lane status, persists lane timings for longest-first ordering, and supports `OPENCLAW_DOCKER_ALL_DRY_RUN=1` for scheduler inspection. It stops scheduling new pooled lanes after the first failure by default, and each lane has a 120-minute fallback timeout overrideable with `OPENCLAW_DOCKER_ALL_LANE_TIMEOUT_MS`; selected live/tail lanes use tighter per-lane caps. 
The reusable live/E2E workflow builds and pushes one SHA-tagged GHCR Docker E2E image, then runs the release-path Docker suite as at most three chunked jobs with `OPENCLAW_SKIP_DOCKER_BUILD=1` so each chunk pulls the shared image once and executes multiple lanes. When Open WebUI is requested with the release-path suite, it runs inside the plugins/integrations chunk instead of reserving a fourth Docker worker; Open WebUI keeps a standalone job only for openwebui-only dispatches. The scheduled live/E2E workflow runs the full release-path Docker suite daily. The bundled update matrix is split by update target so repeated npm update and doctor repair passes can shard with other bundled checks. Local changed-lane logic lives in `scripts/changed-lanes.mjs` and is executed by `scripts/check-changed.mjs`. That local gate is stricter about architecture boundaries than the broad CI platform scope: core production changes run core prod typecheck plus core tests, core test-only changes run only core test typecheck/tests, extension production changes run extension prod typecheck plus extension tests, and extension test-only changes run only extension test typecheck/tests. Public Plugin SDK or plugin-contract changes expand to extension validation because extensions depend on those core contracts. Release metadata-only version bumps run targeted version/config/root-dependency checks. Unknown root/config changes fail safe to all lanes.