ci: chunk release Docker e2e jobs

This commit is contained in:
Peter Steinberger
2026-04-26 21:23:04 +01:00
parent 67ffa3df8b
commit ffa84cdc02
2 changed files with 90 additions and 92 deletions

View File

@@ -364,92 +364,22 @@ jobs:
validate_docker_e2e:
needs: [validate_selected_ref, prepare_docker_e2e_image]
if: inputs.include_release_path_suites
name: Docker E2E (${{ matrix.label }})
runs-on: blacksmith-32vcpu-ubuntu-2404
timeout-minutes: ${{ matrix.timeout_minutes }}
strategy:
fail-fast: false
matrix:
include:
- suite_id: docker-onboard
label: Onboarding Docker E2E
command: pnpm test:docker:onboard
timeout_minutes: 60
release_path: true
- suite_id: docker-npm-onboard-channel-agent
label: Npm Onboard Channel Agent Docker E2E
command: pnpm test:docker:npm-onboard-channel-agent
timeout_minutes: 90
release_path: true
- suite_id: docker-gateway-network
label: Gateway Network Docker E2E
command: pnpm test:docker:gateway-network
timeout_minutes: 60
release_path: true
- suite_id: docker-openai-web-search-minimal
label: OpenAI Web Search Minimal Docker E2E
command: pnpm test:docker:openai-web-search-minimal
timeout_minutes: 60
release_path: true
- suite_id: docker-mcp-channels
label: MCP Channels Docker E2E
command: pnpm test:docker:mcp-channels
timeout_minutes: 60
release_path: true
- suite_id: docker-pi-bundle-mcp-tools
label: Pi Bundle MCP Tools Docker E2E
command: pnpm test:docker:pi-bundle-mcp-tools
timeout_minutes: 60
release_path: true
- suite_id: docker-cron-mcp-cleanup
label: Cron MCP Cleanup Docker E2E
command: pnpm test:docker:cron-mcp-cleanup
timeout_minutes: 60
release_path: true
- suite_id: docker-plugins
label: Plugins Docker E2E
command: pnpm test:docker:plugins
timeout_minutes: 75
release_path: true
- suite_id: docker-plugin-update
label: Plugin Update Docker E2E
command: pnpm test:docker:plugin-update
timeout_minutes: 60
release_path: true
- suite_id: docker-config-reload
label: Config Reload Docker E2E
command: pnpm test:docker:config-reload
timeout_minutes: 60
release_path: true
- suite_id: docker-bundled-channel-deps
label: Bundled Channel Runtime Deps Docker E2E
command: pnpm test:docker:bundled-channel-deps
timeout_minutes: 75
release_path: true
- suite_id: docker-doctor-switch
label: Doctor Install Switch Docker E2E
command: pnpm test:docker:doctor-switch
timeout_minutes: 60
release_path: true
- suite_id: docker-update-channel-switch
label: Update Channel Switch Docker E2E
command: pnpm test:docker:update-channel-switch
timeout_minutes: 60
release_path: true
- suite_id: docker-session-runtime-context
label: Session Runtime Context Docker E2E
command: pnpm test:docker:session-runtime-context
timeout_minutes: 60
release_path: true
- suite_id: docker-qr
label: QR Import Docker E2E
command: pnpm test:docker:qr
timeout_minutes: 60
release_path: true
- suite_id: docker-install-e2e
label: Installer Docker E2E
command: pnpm test:install:e2e
- chunk_id: core
label: core
timeout_minutes: 120
release_path: true
- chunk_id: package-update
label: package/update
timeout_minutes: 180
- chunk_id: plugins-integrations
label: plugins/integrations
timeout_minutes: 180
env:
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
OPENAI_BASE_URL: ${{ secrets.OPENAI_BASE_URL }}
@@ -497,6 +427,8 @@ jobs:
FIREWORKS_API_KEY: ${{ secrets.FIREWORKS_API_KEY }}
OPENCLAW_DOCKER_E2E_IMAGE: ${{ needs.prepare_docker_e2e_image.outputs.image }}
OPENCLAW_SKIP_DOCKER_BUILD: "1"
INCLUDE_OPENWEBUI: ${{ inputs.include_openwebui }}
DOCKER_E2E_CHUNK: ${{ matrix.chunk_id }}
steps:
- name: Checkout selected ref
uses: actions/checkout@v6
@@ -521,22 +453,18 @@ jobs:
- name: Hydrate live auth/profile inputs
run: bash scripts/ci-hydrate-live-auth.sh
- name: Configure suite-specific env
- name: Pull shared Docker E2E image
shell: bash
run: |
set -euo pipefail
case "${{ matrix.suite_id }}" in
docker-install-e2e)
echo "OPENCLAW_E2E_MODELS=both" >> "$GITHUB_ENV"
;;
esac
docker pull "${OPENCLAW_DOCKER_E2E_IMAGE}"
- name: Validate suite credentials
- name: Validate chunk credentials
shell: bash
run: |
set -euo pipefail
case "${{ matrix.suite_id }}" in
docker-install-e2e)
case "${DOCKER_E2E_CHUNK}" in
package-update)
[[ -n "${OPENAI_API_KEY:-}" ]] || {
echo "OPENAI_API_KEY is required for installer Docker E2E." >&2
exit 1
@@ -546,14 +474,84 @@ jobs:
exit 1
fi
;;
plugins-integrations)
if [[ "${INCLUDE_OPENWEBUI}" == "true" ]]; then
[[ -n "${OPENAI_API_KEY:-}" ]] || {
echo "OPENAI_API_KEY is required for the Open WebUI Docker smoke." >&2
exit 1
}
fi
;;
esac
- name: Run ${{ matrix.label }}
run: ${{ matrix.command }}
- name: Run Docker E2E chunk
shell: bash
run: |
set -euo pipefail
failures=()
run_lane() {
local label="$1"
shift
echo "::group::${label}"
local status=0
"$@" || status=$?
echo "::endgroup::"
if [[ "$status" -ne 0 ]]; then
failures+=("${label} exited ${status}")
fi
}
run_openwebui_lane() {
if [[ "${INCLUDE_OPENWEBUI}" != "true" ]]; then
echo "Skipping Open WebUI Docker E2E because include_openwebui=false."
return 0
fi
run_lane "Open WebUI Docker E2E" pnpm test:docker:openwebui
}
case "${DOCKER_E2E_CHUNK}" in
core)
run_lane "QR Import Docker E2E" pnpm test:docker:qr
run_lane "Onboarding Docker E2E" pnpm test:docker:onboard
run_lane "Gateway Network Docker E2E" pnpm test:docker:gateway-network
run_lane "Config Reload Docker E2E" pnpm test:docker:config-reload
run_lane "Session Runtime Context Docker E2E" pnpm test:docker:session-runtime-context
run_lane "Pi Bundle MCP Tools Docker E2E" pnpm test:docker:pi-bundle-mcp-tools
run_lane "MCP Channels Docker E2E" pnpm test:docker:mcp-channels
;;
package-update)
run_lane "Installer Docker E2E" env OPENCLAW_E2E_MODELS=both pnpm test:install:e2e
run_lane "Npm Onboard Channel Agent Docker E2E" pnpm test:docker:npm-onboard-channel-agent
run_lane "Doctor Install Switch Docker E2E" pnpm test:docker:doctor-switch
run_lane "Update Channel Switch Docker E2E" pnpm test:docker:update-channel-switch
;;
plugins-integrations)
run_lane "Plugins Docker E2E" pnpm test:docker:plugins
run_lane "Plugin Update Docker E2E" pnpm test:docker:plugin-update
run_lane "Bundled Channel Runtime Deps Docker E2E" pnpm test:docker:bundled-channel-deps
run_lane "Cron MCP Cleanup Docker E2E" pnpm test:docker:cron-mcp-cleanup
run_lane "OpenAI Web Search Minimal Docker E2E" pnpm test:docker:openai-web-search-minimal
run_openwebui_lane
;;
*)
echo "Unknown Docker E2E chunk: ${DOCKER_E2E_CHUNK}" >&2
exit 1
;;
esac
if (( ${#failures[@]} > 0 )); then
printf 'Docker E2E chunk %s failed:\n' "${DOCKER_E2E_CHUNK}" >&2
printf -- '- %s\n' "${failures[@]}" >&2
exit 1
fi
validate_docker_openwebui:
needs: [validate_selected_ref, prepare_docker_e2e_image]
if: inputs.include_openwebui
if: inputs.include_openwebui && !inputs.include_release_path_suites
runs-on: blacksmith-32vcpu-ubuntu-2404
timeout-minutes: 75
env:

View File

@@ -92,7 +92,7 @@ Scope logic lives in `scripts/ci-changed-scope.mjs` and is covered by unit tests
CI workflow edits validate the Node CI graph plus workflow linting, but do not force Windows, Android, or macOS native builds by themselves; those platform lanes stay scoped to platform source changes.
CI routing-only edits, selected cheap core-test fixture edits, and narrow plugin contract helper/test-routing edits use a fast Node-only manifest path: preflight, security, and a single `checks-fast-core` task. That path avoids build artifacts, Node 22 compatibility, channel contracts, full core shards, bundled-plugin shards, and additional guard matrices when the changed files are limited to the routing or helper surfaces that the fast task exercises directly.
Windows Node checks are scoped to Windows-specific process/path wrappers, npm/pnpm/UI runner helpers, package manager config, and the CI workflow surfaces that execute that lane; unrelated source, plugin, install-smoke, and test-only changes stay on the Linux Node lanes so they do not reserve a 16-vCPU Windows worker for coverage that is already exercised by the normal test shards.
The separate `install-smoke` workflow reuses the same scope script through its own `preflight` job. It splits smoke coverage into `run_fast_install_smoke` and `run_full_install_smoke`. Pull requests run the fast path for Docker/package surfaces, bundled plugin package/manifest changes, and core plugin/channel/gateway/Plugin SDK surfaces that the Docker smoke jobs exercise. Source-only bundled plugin changes, test-only edits, and docs-only edits do not reserve Docker workers. The fast path builds the root Dockerfile image once, checks the CLI, runs the agents delete shared-workspace CLI smoke, runs the container gateway-network e2e, verifies a bundled extension build arg, and runs the bounded bundled-plugin Docker profile under a 240-second aggregate command timeout with each scenario's Docker run capped separately. The full path keeps QR package install and installer Docker/update coverage for nightly scheduled runs, manual dispatches, workflow-call release checks, and pull requests that truly touch installer/package/Docker surfaces. `main` pushes, including merge commits, do not force the full path; when changed-scope logic would request full coverage on a push, the workflow keeps the fast Docker smoke and leaves the full install smoke to nightly or release validation. The slow Bun global install image-provider smoke is separately gated by `run_bun_global_install_smoke`; it runs on the nightly schedule and from the release checks workflow, and manual `install-smoke` dispatches can opt into it, but pull requests and `main` pushes do not run it. QR and installer Docker tests keep their own install-focused Dockerfiles. Local `test:docker:all` prebuilds one shared live-test image and one shared `scripts/e2e/Dockerfile` built-app image, then runs the live/E2E smoke lanes with a weighted scheduler and `OPENCLAW_SKIP_DOCKER_BUILD=1`; tune the default main-pool slot count of 10 with `OPENCLAW_DOCKER_ALL_PARALLELISM` and the provider-sensitive tail-pool slot count of 10 with `OPENCLAW_DOCKER_ALL_TAIL_PARALLELISM`. Heavy lane caps default to `OPENCLAW_DOCKER_ALL_LIVE_LIMIT=6`, `OPENCLAW_DOCKER_ALL_NPM_LIMIT=8`, and `OPENCLAW_DOCKER_ALL_SERVICE_LIMIT=7` so npm install and multi-service lanes do not overcommit Docker while lighter lanes still fill available slots. Lane starts are staggered by 2 seconds by default to avoid local Docker daemon create storms; override with `OPENCLAW_DOCKER_ALL_START_STAGGER_MS=0` or another millisecond value. The local aggregate preflights Docker, removes stale OpenClaw E2E containers, emits active-lane status, persists lane timings for longest-first ordering, and supports `OPENCLAW_DOCKER_ALL_DRY_RUN=1` for scheduler inspection. It stops scheduling new pooled lanes after the first failure by default, and each lane has a 120-minute fallback timeout overrideable with `OPENCLAW_DOCKER_ALL_LANE_TIMEOUT_MS`; selected live/tail lanes use tighter per-lane caps. The reusable live/E2E workflow mirrors the shared-image pattern by building and pushing one SHA-tagged GHCR Docker E2E image before the Docker matrix, then running the matrix with `OPENCLAW_SKIP_DOCKER_BUILD=1`. The scheduled live/E2E workflow runs the full release-path Docker suite daily. The bundled update matrix is split by update target so repeated npm update and doctor repair passes can shard with other bundled checks.
The separate `install-smoke` workflow reuses the same scope script through its own `preflight` job. It splits smoke coverage into `run_fast_install_smoke` and `run_full_install_smoke`. Pull requests run the fast path for Docker/package surfaces, bundled plugin package/manifest changes, and core plugin/channel/gateway/Plugin SDK surfaces that the Docker smoke jobs exercise. Source-only bundled plugin changes, test-only edits, and docs-only edits do not reserve Docker workers. The fast path builds the root Dockerfile image once, checks the CLI, runs the agents delete shared-workspace CLI smoke, runs the container gateway-network e2e, verifies a bundled extension build arg, and runs the bounded bundled-plugin Docker profile under a 240-second aggregate command timeout with each scenario's Docker run capped separately. The full path keeps QR package install and installer Docker/update coverage for nightly scheduled runs, manual dispatches, workflow-call release checks, and pull requests that truly touch installer/package/Docker surfaces. `main` pushes, including merge commits, do not force the full path; when changed-scope logic would request full coverage on a push, the workflow keeps the fast Docker smoke and leaves the full install smoke to nightly or release validation. The slow Bun global install image-provider smoke is separately gated by `run_bun_global_install_smoke`; it runs on the nightly schedule and from the release checks workflow, and manual `install-smoke` dispatches can opt into it, but pull requests and `main` pushes do not run it. QR and installer Docker tests keep their own install-focused Dockerfiles. Local `test:docker:all` prebuilds one shared live-test image and one shared `scripts/e2e/Dockerfile` built-app image, then runs the live/E2E smoke lanes with a weighted scheduler and `OPENCLAW_SKIP_DOCKER_BUILD=1`; tune the default main-pool slot count of 10 with `OPENCLAW_DOCKER_ALL_PARALLELISM` and the provider-sensitive tail-pool slot count of 10 with `OPENCLAW_DOCKER_ALL_TAIL_PARALLELISM`. Heavy lane caps default to `OPENCLAW_DOCKER_ALL_LIVE_LIMIT=6`, `OPENCLAW_DOCKER_ALL_NPM_LIMIT=8`, and `OPENCLAW_DOCKER_ALL_SERVICE_LIMIT=7` so npm install and multi-service lanes do not overcommit Docker while lighter lanes still fill available slots. Lane starts are staggered by 2 seconds by default to avoid local Docker daemon create storms; override with `OPENCLAW_DOCKER_ALL_START_STAGGER_MS=0` or another millisecond value. The local aggregate preflights Docker, removes stale OpenClaw E2E containers, emits active-lane status, persists lane timings for longest-first ordering, and supports `OPENCLAW_DOCKER_ALL_DRY_RUN=1` for scheduler inspection. It stops scheduling new pooled lanes after the first failure by default, and each lane has a 120-minute fallback timeout overrideable with `OPENCLAW_DOCKER_ALL_LANE_TIMEOUT_MS`; selected live/tail lanes use tighter per-lane caps. The reusable live/E2E workflow builds and pushes one SHA-tagged GHCR Docker E2E image, then runs the release-path Docker suite as at most three chunked jobs with `OPENCLAW_SKIP_DOCKER_BUILD=1` so each chunk pulls the shared image once and executes multiple lanes. When Open WebUI is requested with the release-path suite, it runs inside the plugins/integrations chunk instead of reserving a fourth Docker worker; Open WebUI keeps a standalone job only for openwebui-only dispatches. The scheduled live/E2E workflow runs the full release-path Docker suite daily. The bundled update matrix is split by update target so repeated npm update and doctor repair passes can shard with other bundled checks.
Local changed-lane logic lives in `scripts/changed-lanes.mjs` and is executed by `scripts/check-changed.mjs`. That local gate is stricter about architecture boundaries than the broad CI platform scope: core production changes run core prod typecheck plus core tests, core test-only changes run only core test typecheck/tests, extension production changes run extension prod typecheck plus extension tests, and extension test-only changes run only extension test typecheck/tests. Public Plugin SDK or plugin-contract changes expand to extension validation because extensions depend on those core contracts. Release metadata-only version bumps run targeted version/config/root-dependency checks. Unknown root/config changes fail safe to all lanes.