From 38caa6832d4e1d344736d635b0c5fdad37e4430b Mon Sep 17 00:00:00 2001 From: Peter Steinberger Date: Fri, 24 Apr 2026 08:38:37 +0100 Subject: [PATCH] test: stagger docker aggregate starts --- docs/ci.md | 2 +- docs/reference/test.md | 2 +- scripts/test-docker-all.mjs | 47 ++++++++++++++++++++++++++++++++++++- 3 files changed, 48 insertions(+), 3 deletions(-) diff --git a/docs/ci.md b/docs/ci.md index a3aa9de6b87..a8fc2379548 100644 --- a/docs/ci.md +++ b/docs/ci.md @@ -91,7 +91,7 @@ Jobs are ordered so cheap checks fail before expensive ones run: Scope logic lives in `scripts/ci-changed-scope.mjs` and is covered by unit tests in `src/scripts/ci-changed-scope.test.ts`. CI workflow edits validate the Node CI graph plus workflow linting, but do not force Windows, Android, or macOS native builds by themselves; those platform lanes stay scoped to platform source changes. Windows Node checks are scoped to Windows-specific process/path wrappers, npm/pnpm/UI runner helpers, package manager config, and the CI workflow surfaces that execute that lane; unrelated source, plugin, install-smoke, and test-only changes stay on the Linux Node lanes so they do not reserve a 16-vCPU Windows worker for coverage that is already exercised by the normal test shards. -The separate `install-smoke` workflow reuses the same scope script through its own `preflight` job. It splits smoke coverage into `run_fast_install_smoke` and `run_full_install_smoke`. Pull requests run the fast path for Docker/package surfaces, bundled plugin package/manifest changes, and core plugin/channel/gateway/Plugin SDK surfaces that the Docker smoke jobs exercise. Source-only bundled plugin changes, test-only edits, and docs-only edits do not reserve Docker workers. The fast path builds the root Dockerfile image once, checks the CLI, runs the container gateway-network e2e, verifies a bundled extension build arg, and runs the bounded bundled-plugin Docker profile under a 120-second command timeout. The full path keeps QR package install and installer Docker/update coverage for nightly scheduled runs, manual dispatches, workflow-call release checks, and pull requests that truly touch installer/package/Docker surfaces. `main` pushes, including merge commits, do not force the full path; when changed-scope logic would request full coverage on a push, the workflow keeps the fast Docker smoke and leaves the full install smoke to nightly or release validation. The slow Bun global install image-provider smoke is separately gated by `run_bun_global_install_smoke`; it runs on the nightly schedule and from the release checks workflow, and manual `install-smoke` dispatches can opt into it, but pull requests and `main` pushes do not run it. QR and installer Docker tests keep their own install-focused Dockerfiles. Local `test:docker:all` prebuilds one shared live-test image and one shared `scripts/e2e/Dockerfile` built-app image, then runs the live/E2E smoke lanes in parallel with `OPENCLAW_SKIP_DOCKER_BUILD=1`; tune the default main-pool concurrency of 8 with `OPENCLAW_DOCKER_ALL_PARALLELISM` and the provider-sensitive tail-pool concurrency of 8 with `OPENCLAW_DOCKER_ALL_TAIL_PARALLELISM`. The local aggregate stops scheduling new pooled lanes after the first failure by default, and each lane has a 120-minute timeout overrideable with `OPENCLAW_DOCKER_ALL_LANE_TIMEOUT_MS`. The reusable live/E2E workflow mirrors the shared-image pattern by building and pushing one SHA-tagged GHCR Docker E2E image before the Docker matrix, then running the matrix with `OPENCLAW_SKIP_DOCKER_BUILD=1`. The scheduled live/E2E workflow runs the full release-path Docker suite daily. The full bundled update/channel matrix remains manual/full-suite because it performs repeated real npm update and doctor repair passes. +The separate `install-smoke` workflow reuses the same scope script through its own `preflight` job. It splits smoke coverage into `run_fast_install_smoke` and `run_full_install_smoke`. Pull requests run the fast path for Docker/package surfaces, bundled plugin package/manifest changes, and core plugin/channel/gateway/Plugin SDK surfaces that the Docker smoke jobs exercise. Source-only bundled plugin changes, test-only edits, and docs-only edits do not reserve Docker workers. The fast path builds the root Dockerfile image once, checks the CLI, runs the container gateway-network e2e, verifies a bundled extension build arg, and runs the bounded bundled-plugin Docker profile under a 120-second command timeout. The full path keeps QR package install and installer Docker/update coverage for nightly scheduled runs, manual dispatches, workflow-call release checks, and pull requests that truly touch installer/package/Docker surfaces. `main` pushes, including merge commits, do not force the full path; when changed-scope logic would request full coverage on a push, the workflow keeps the fast Docker smoke and leaves the full install smoke to nightly or release validation. The slow Bun global install image-provider smoke is separately gated by `run_bun_global_install_smoke`; it runs on the nightly schedule and from the release checks workflow, and manual `install-smoke` dispatches can opt into it, but pull requests and `main` pushes do not run it. QR and installer Docker tests keep their own install-focused Dockerfiles. Local `test:docker:all` prebuilds one shared live-test image and one shared `scripts/e2e/Dockerfile` built-app image, then runs the live/E2E smoke lanes in parallel with `OPENCLAW_SKIP_DOCKER_BUILD=1`; tune the default main-pool concurrency of 8 with `OPENCLAW_DOCKER_ALL_PARALLELISM` and the provider-sensitive tail-pool concurrency of 8 with `OPENCLAW_DOCKER_ALL_TAIL_PARALLELISM`. Lane starts are staggered by 2 seconds by default to avoid local Docker daemon create storms; override with `OPENCLAW_DOCKER_ALL_START_STAGGER_MS=0` or another millisecond value. The local aggregate stops scheduling new pooled lanes after the first failure by default, and each lane has a 120-minute timeout overrideable with `OPENCLAW_DOCKER_ALL_LANE_TIMEOUT_MS`. The reusable live/E2E workflow mirrors the shared-image pattern by building and pushing one SHA-tagged GHCR Docker E2E image before the Docker matrix, then running the matrix with `OPENCLAW_SKIP_DOCKER_BUILD=1`. The scheduled live/E2E workflow runs the full release-path Docker suite daily. The full bundled update/channel matrix remains manual/full-suite because it performs repeated real npm update and doctor repair passes. Local changed-lane logic lives in `scripts/changed-lanes.mjs` and is executed by `scripts/check-changed.mjs`. That local gate is stricter about architecture boundaries than the broad CI platform scope: core production changes run core prod typecheck plus core tests, core test-only changes run only core test typecheck/tests, extension production changes run extension prod typecheck plus extension tests, and extension test-only changes run only extension test typecheck/tests. Public Plugin SDK or plugin-contract changes expand to extension validation because extensions depend on those core contracts. Release metadata-only version bumps run targeted version/config/root-dependency checks. Unknown root/config changes fail safe to all lanes. diff --git a/docs/reference/test.md b/docs/reference/test.md index a6947bfba1e..c2776a9950d 100644 --- a/docs/reference/test.md +++ b/docs/reference/test.md @@ -32,7 +32,7 @@ title: "Tests" - Gateway integration: opt-in via `OPENCLAW_TEST_INCLUDE_GATEWAY=1 pnpm test` or `pnpm test:gateway`. - `pnpm test:e2e`: Runs gateway end-to-end smoke tests (multi-instance WS/HTTP/node pairing). Defaults to `threads` + `isolate: false` with adaptive workers in `vitest.e2e.config.ts`; tune with `OPENCLAW_E2E_WORKERS=` and set `OPENCLAW_E2E_VERBOSE=1` for verbose logs. - `pnpm test:live`: Runs provider live tests (minimax/zai). Requires API keys and `LIVE=1` (or provider-specific `*_LIVE_TEST=1`) to unskip. -- `pnpm test:docker:all`: Builds the shared live-test image and Docker E2E image once, then runs the Docker smoke lanes with `OPENCLAW_SKIP_DOCKER_BUILD=1` at concurrency 8 by default. Tune the main pool with `OPENCLAW_DOCKER_ALL_PARALLELISM=` and the provider-sensitive tail pool with `OPENCLAW_DOCKER_ALL_TAIL_PARALLELISM=`; both default to 8. The runner stops scheduling new pooled lanes after the first failure unless `OPENCLAW_DOCKER_ALL_FAIL_FAST=0` is set, and each lane has a 120-minute timeout overrideable with `OPENCLAW_DOCKER_ALL_LANE_TIMEOUT_MS`. Per-lane logs are written under `.artifacts/docker-tests//`. +- `pnpm test:docker:all`: Builds the shared live-test image and Docker E2E image once, then runs the Docker smoke lanes with `OPENCLAW_SKIP_DOCKER_BUILD=1` at concurrency 8 by default. Tune the main pool with `OPENCLAW_DOCKER_ALL_PARALLELISM=` and the provider-sensitive tail pool with `OPENCLAW_DOCKER_ALL_TAIL_PARALLELISM=`; both default to 8. Lane starts are staggered by 2 seconds by default to avoid local Docker daemon create storms; override with `OPENCLAW_DOCKER_ALL_START_STAGGER_MS=`. The runner stops scheduling new pooled lanes after the first failure unless `OPENCLAW_DOCKER_ALL_FAIL_FAST=0` is set, and each lane has a 120-minute timeout overrideable with `OPENCLAW_DOCKER_ALL_LANE_TIMEOUT_MS`. Per-lane logs are written under `.artifacts/docker-tests//`. - `pnpm test:docker:openwebui`: Starts Dockerized OpenClaw + Open WebUI, signs in through Open WebUI, checks `/api/models`, then runs a real proxied chat through `/api/chat/completions`. Requires a usable live model key (for example OpenAI in `~/.profile`), pulls an external Open WebUI image, and is not expected to be CI-stable like the normal unit/e2e suites. - `pnpm test:docker:mcp-channels`: Starts a seeded Gateway container and a second client container that spawns `openclaw mcp serve`, then verifies routed conversation discovery, transcript reads, attachment metadata, live event queue behavior, outbound send routing, and Claude-style channel + permission notifications over the real stdio bridge. The Claude notification assertion reads the raw stdio MCP frames directly so the smoke reflects what the bridge actually emits. diff --git a/scripts/test-docker-all.mjs b/scripts/test-docker-all.mjs index fda4137ab96..391f5cbc75f 100644 --- a/scripts/test-docker-all.mjs +++ b/scripts/test-docker-all.mjs @@ -10,6 +10,7 @@ const DEFAULT_PARALLELISM = 8; const DEFAULT_TAIL_PARALLELISM = 8; const DEFAULT_FAILURE_TAIL_LINES = 80; const DEFAULT_LANE_TIMEOUT_MS = 120 * 60 * 1000; +const DEFAULT_LANE_START_STAGGER_MS = 2_000; const lanes = [ ["live-models", "OPENCLAW_SKIP_DOCKER_BUILD=1 pnpm test:docker:live-models"], @@ -68,6 +69,17 @@ function parsePositiveInt(raw, fallback, label) { return parsed; } +function parseNonNegativeInt(raw, fallback, label) { + if (!raw) { + return fallback; + } + const parsed = Number(raw); + if (!Number.isInteger(parsed) || parsed < 0) { + throw new Error(`${label} must be a non-negative integer. Got: ${JSON.stringify(raw)}`); + } + return parsed; +} + function parseBool(raw, fallback) { if (raw === undefined || raw === "") { return fallback; @@ -75,6 +87,12 @@ function parseBool(raw, fallback) { return !/^(?:0|false|no)$/i.test(raw); } +function sleep(ms) { + return new Promise((resolve) => { + setTimeout(resolve, ms); + }); +} + function utcStampForPath() { return new Date().toISOString().replaceAll("-", "").replaceAll(":", "").replace(/\..*$/, "Z"); } @@ -209,6 +227,26 @@ async function runLane(lane, baseEnv, logDir, timeoutMs) { async function runLanePool(poolLanes, baseEnv, logDir, parallelism, options) { const failures = []; let nextIndex = 0; + let lastLaneStartAt = 0; + let laneStartQueue = Promise.resolve(); + + async function waitForLaneStartSlot() { + if (options.startStaggerMs <= 0) { + return; + } + const previous = laneStartQueue; + let releaseQueue; + laneStartQueue = new Promise((resolve) => { + releaseQueue = resolve; + }); + await previous; + const waitMs = Math.max(0, lastLaneStartAt + options.startStaggerMs - Date.now()); + if (waitMs > 0) { + await sleep(waitMs); + } + lastLaneStartAt = Date.now(); + releaseQueue(); + } async function worker() { while (nextIndex < poolLanes.length) { @@ -216,6 +254,7 @@ async function runLanePool(poolLanes, baseEnv, logDir, parallelism, options) { return; } const lane = poolLanes[nextIndex++]; + await waitForLaneStartSlot(); const result = await runLane(lane, baseEnv, logDir, options.timeoutMs); if (result.status !== 0) { failures.push(result); @@ -285,6 +324,11 @@ async function main() { DEFAULT_LANE_TIMEOUT_MS, "OPENCLAW_DOCKER_ALL_LANE_TIMEOUT_MS", ); + const laneStartStaggerMs = parseNonNegativeInt( + process.env.OPENCLAW_DOCKER_ALL_START_STAGGER_MS, + DEFAULT_LANE_START_STAGGER_MS, + "OPENCLAW_DOCKER_ALL_START_STAGGER_MS", + ); const failFast = parseBool(process.env.OPENCLAW_DOCKER_ALL_FAIL_FAST, true); const runId = process.env.OPENCLAW_DOCKER_ALL_RUN_ID || utcStampForPath(); const logDir = path.resolve( @@ -304,6 +348,7 @@ async function main() { console.log(`==> Parallelism: ${parallelism}`); console.log(`==> Tail parallelism: ${tailParallelism}`); console.log(`==> Lane timeout: ${laneTimeoutMs}ms`); + console.log(`==> Lane start stagger: ${laneStartStaggerMs}ms`); console.log(`==> Fail fast: ${failFast ? "yes" : "no"}`); console.log(`==> Live-test bundled plugin deps: ${baseEnv.OPENCLAW_DOCKER_BUILD_EXTENSIONS}`); @@ -314,7 +359,7 @@ async function main() { baseEnv, ); - const options = { failFast, timeoutMs: laneTimeoutMs }; + const options = { failFast, startStaggerMs: laneStartStaggerMs, timeoutMs: laneTimeoutMs }; const failures = await runLanePool(lanes, baseEnv, logDir, parallelism, options); if (failFast && failures.length > 0) { await printFailureSummary(failures, tailLines);