test: speed up Docker live scheduling

This commit is contained in:
Peter Steinberger
2026-04-25 10:01:41 +01:00
parent 6bdf87de87
commit 70b3ba2fed
4 changed files with 207 additions and 30 deletions

View File

@@ -11,6 +11,7 @@ const DEFAULT_TAIL_PARALLELISM = 10;
const DEFAULT_FAILURE_TAIL_LINES = 80;
const DEFAULT_LANE_TIMEOUT_MS = 120 * 60 * 1000;
const DEFAULT_LANE_START_STAGGER_MS = 2_000;
const DEFAULT_LIVE_RETRIES = 1;
const DEFAULT_STATUS_INTERVAL_MS = 30_000;
const DEFAULT_PREFLIGHT_RUN_TIMEOUT_MS = 60_000;
const DEFAULT_TIMINGS_FILE = path.join(ROOT_DIR, ".artifacts/docker-tests/lane-timings.json");
@@ -21,29 +22,71 @@ const OPENWEBUI_TIMEOUT_MS = 20 * 60 * 1000;
const BUNDLED_UPDATE_TIMEOUT_MS = 20 * 60 * 1000;
const DEFAULT_RESOURCE_LIMITS = {
docker: DEFAULT_PARALLELISM,
live: 6,
npm: 8,
live: 9,
"live:claude": 4,
"live:codex": 4,
"live:gemini": 4,
npm: 10,
service: 7,
};
const LIVE_RETRY_PATTERNS = [
/529\b/i,
/overloaded/i,
/capacity/i,
/rate.?limit/i,
/gateway closed \(1000 normal closure\)/i,
/ECONNRESET|ETIMEDOUT|ENOTFOUND/i,
];
const bundledChannelLaneCommand =
"OPENCLAW_SKIP_DOCKER_BUILD=1 OPENCLAW_BUNDLED_CHANNEL_UPDATE_SCENARIO=0 OPENCLAW_BUNDLED_CHANNEL_ROOT_OWNED_SCENARIO=0 OPENCLAW_BUNDLED_CHANNEL_SETUP_ENTRY_SCENARIO=0 OPENCLAW_BUNDLED_CHANNEL_LOAD_FAILURE_SCENARIO=0 OPENCLAW_BUNDLED_CHANNEL_DISABLED_CONFIG_SCENARIO=0 pnpm test:docker:bundled-channel-deps";
function lane(name, command, options = {}) {
return {
cacheKey: options.cacheKey,
command,
estimateSeconds: options.estimateSeconds,
live: options.live === true,
name,
retryPatterns: options.retryPatterns ?? [],
retries: options.retries ?? 0,
resources: options.resources ?? [],
timeoutMs: options.timeoutMs,
weight: options.weight ?? 1,
};
}
function liveProviderResource(provider) {
if (!provider) {
return undefined;
}
if (provider === "claude-cli" || provider === "claude") {
return "live:claude";
}
if (provider === "codex-cli" || provider === "codex") {
return "live:codex";
}
if (provider === "google-gemini-cli" || provider === "gemini") {
return "live:gemini";
}
if (provider === "openai") {
return "live:openai";
}
return `live:${provider}`;
}
function liveProviderResources(options) {
const providers = options.providers ?? (options.provider ? [options.provider] : []);
return providers.map(liveProviderResource).filter(Boolean);
}
function liveLane(name, command, options = {}) {
return lane(name, command, {
...options,
resources: ["live", ...(options.resources ?? [])],
live: true,
resources: ["live", ...liveProviderResources(options), ...(options.resources ?? [])],
retryPatterns: options.retryPatterns ?? LIVE_RETRY_PATTERNS,
retries: options.retries ?? DEFAULT_LIVE_RETRIES,
weight: options.weight ?? 3,
});
}
@@ -132,22 +175,36 @@ const bundledScenarioLanes = [
const lanes = [
liveLane("live-models", "OPENCLAW_SKIP_DOCKER_BUILD=1 pnpm test:docker:live-models", {
providers: ["claude-cli", "codex-cli", "google-gemini-cli"],
timeoutMs: LIVE_PROFILE_TIMEOUT_MS,
weight: 4,
}),
liveLane("live-gateway", "OPENCLAW_SKIP_DOCKER_BUILD=1 pnpm test:docker:live-gateway", {
providers: ["claude-cli", "codex-cli", "google-gemini-cli"],
timeoutMs: LIVE_PROFILE_TIMEOUT_MS,
weight: 4,
}),
liveLane(
"live-cli-backend-claude",
"OPENCLAW_SKIP_DOCKER_BUILD=1 pnpm test:docker:live-cli-backend:claude",
{ resources: ["npm"], timeoutMs: LIVE_CLI_TIMEOUT_MS, weight: 3 },
{
cacheKey: "cli-backend-claude",
provider: "claude-cli",
resources: ["npm"],
timeoutMs: LIVE_CLI_TIMEOUT_MS,
weight: 3,
},
),
liveLane(
"live-cli-backend-gemini",
"OPENCLAW_SKIP_DOCKER_BUILD=1 pnpm test:docker:live-cli-backend:gemini",
{ resources: ["npm"], timeoutMs: LIVE_CLI_TIMEOUT_MS, weight: 3 },
{
cacheKey: "cli-backend-gemini",
provider: "google-gemini-cli",
resources: ["npm"],
timeoutMs: LIVE_CLI_TIMEOUT_MS,
weight: 3,
},
),
serviceLane("openwebui", "OPENCLAW_SKIP_DOCKER_BUILD=1 pnpm test:docker:openwebui", {
timeoutMs: OPENWEBUI_TIMEOUT_MS,
@@ -204,9 +261,17 @@ const exclusiveLanes = [
liveLane(
"live-codex-harness",
"OPENCLAW_SKIP_DOCKER_BUILD=1 pnpm test:docker:live-codex-harness",
{ resources: ["npm"], timeoutMs: LIVE_ACP_TIMEOUT_MS, weight: 3 },
{
cacheKey: "codex-harness",
provider: "codex-cli",
resources: ["npm"],
timeoutMs: LIVE_ACP_TIMEOUT_MS,
weight: 3,
},
),
liveLane("live-codex-bind", "OPENCLAW_SKIP_DOCKER_BUILD=1 pnpm test:docker:live-codex-bind", {
cacheKey: "codex-harness",
provider: "codex-cli",
resources: ["npm"],
timeoutMs: LIVE_ACP_TIMEOUT_MS,
weight: 3,
@@ -214,22 +279,46 @@ const exclusiveLanes = [
liveLane(
"live-cli-backend-codex",
"OPENCLAW_SKIP_DOCKER_BUILD=1 pnpm test:docker:live-cli-backend:codex",
{ resources: ["npm"], timeoutMs: LIVE_CLI_TIMEOUT_MS, weight: 3 },
{
cacheKey: "cli-backend-codex",
provider: "codex-cli",
resources: ["npm"],
timeoutMs: LIVE_CLI_TIMEOUT_MS,
weight: 3,
},
),
liveLane(
"live-acp-bind-claude",
"OPENCLAW_SKIP_DOCKER_BUILD=1 pnpm test:docker:live-acp-bind:claude",
{ resources: ["npm"], timeoutMs: LIVE_ACP_TIMEOUT_MS, weight: 3 },
{
cacheKey: "acp-bind-claude",
provider: "claude-cli",
resources: ["npm"],
timeoutMs: LIVE_ACP_TIMEOUT_MS,
weight: 3,
},
),
liveLane(
"live-acp-bind-codex",
"OPENCLAW_SKIP_DOCKER_BUILD=1 pnpm test:docker:live-acp-bind:codex",
{ resources: ["npm"], timeoutMs: LIVE_ACP_TIMEOUT_MS, weight: 3 },
{
cacheKey: "acp-bind-codex",
provider: "codex-cli",
resources: ["npm"],
timeoutMs: LIVE_ACP_TIMEOUT_MS,
weight: 3,
},
),
liveLane(
"live-acp-bind-gemini",
"OPENCLAW_SKIP_DOCKER_BUILD=1 pnpm test:docker:live-acp-bind:gemini",
{ resources: ["npm"], timeoutMs: LIVE_ACP_TIMEOUT_MS, weight: 3 },
{
cacheKey: "acp-bind-gemini",
provider: "google-gemini-cli",
resources: ["npm"],
timeoutMs: LIVE_ACP_TIMEOUT_MS,
weight: 3,
},
),
];
@@ -264,8 +353,39 @@ function parseBool(raw, fallback) {
return !/^(?:0|false|no)$/i.test(raw);
}
function parseLiveMode(raw) {
const mode = raw || "all";
if (mode === "all" || mode === "skip" || mode === "only") {
return mode;
}
throw new Error(
`OPENCLAW_DOCKER_ALL_LIVE_MODE must be one of: all, skip, only. Got: ${JSON.stringify(raw)}`,
);
}
function applyLiveMode(poolLanes, mode) {
if (mode === "all") {
return poolLanes;
}
return poolLanes.filter((poolLane) => (mode === "only" ? poolLane.live : !poolLane.live));
}
function applyLiveRetries(poolLanes, retries) {
return poolLanes.map((poolLane) => (poolLane.live ? { ...poolLane, retries } : poolLane));
}
function resourceLimitsSummary(resourceLimits) {
return Object.entries(resourceLimits)
.map(([resource, limit]) => `${resource}=${String(limit)}`)
.join(" ");
}
function resourceLimitEnvName(resource) {
return `OPENCLAW_DOCKER_ALL_${resource.toUpperCase().replace(/[^A-Z0-9]+/g, "_")}_LIMIT`;
}
function parseResourceLimit(env, resource, parallelism, fallback) {
const envName = `OPENCLAW_DOCKER_ALL_${resource.toUpperCase()}_LIMIT`;
const envName = resourceLimitEnvName(resource);
return parsePositiveInt(env[envName], Math.min(parallelism, fallback), envName);
}
@@ -275,13 +395,12 @@ function parseSchedulerOptions(env, parallelism) {
parallelism,
"OPENCLAW_DOCKER_ALL_WEIGHT_LIMIT",
);
const resourceLimits = {};
for (const [resource, fallback] of Object.entries(DEFAULT_RESOURCE_LIMITS)) {
resourceLimits[resource] = parseResourceLimit(env, resource, parallelism, fallback);
}
return {
resourceLimits: {
docker: parseResourceLimit(env, "docker", parallelism, parallelism),
live: parseResourceLimit(env, "live", parallelism, DEFAULT_RESOURCE_LIMITS.live),
npm: parseResourceLimit(env, "npm", parallelism, DEFAULT_RESOURCE_LIMITS.npm),
service: parseResourceLimit(env, "service", parallelism, DEFAULT_RESOURCE_LIMITS.service),
},
resourceLimits,
weightLimit,
};
}
@@ -297,7 +416,9 @@ function laneResources(poolLane) {
function laneSummary(poolLane) {
const resources = laneResources(poolLane).join(",");
const timeout = poolLane.timeoutMs ? ` timeout=${Math.round(poolLane.timeoutMs / 1000)}s` : "";
return `${poolLane.name}(w=${laneWeight(poolLane)} r=${resources}${timeout})`;
const retries = poolLane.retries > 0 ? ` retries=${poolLane.retries}` : "";
const cache = poolLane.cacheKey ? ` cache=${poolLane.cacheKey}` : "";
return `${poolLane.name}(w=${laneWeight(poolLane)} r=${resources}${timeout}${retries}${cache})`;
}
function sleep(ms) {
@@ -638,15 +759,16 @@ async function prepareBundledChannelPackage(baseEnv, logDir) {
console.log(`==> Bundled channel package: ${baseEnv.OPENCLAW_BUNDLED_CHANNEL_PACKAGE_TGZ}`);
}
function laneEnv(name, baseEnv, logDir) {
function laneEnv(name, baseEnv, logDir, cacheKey) {
const env = {
...baseEnv,
};
const cacheName = cacheKey || name;
if (!process.env.OPENCLAW_DOCKER_CLI_TOOLS_DIR) {
env.OPENCLAW_DOCKER_CLI_TOOLS_DIR = path.join(logDir, `${name}-cli-tools`);
env.OPENCLAW_DOCKER_CLI_TOOLS_DIR = path.join(logDir, `${cacheName}-cli-tools`);
}
if (!process.env.OPENCLAW_DOCKER_CACHE_HOME_DIR) {
env.OPENCLAW_DOCKER_CACHE_HOME_DIR = path.join(logDir, `${name}-cache`);
env.OPENCLAW_DOCKER_CACHE_HOME_DIR = path.join(logDir, `${cacheName}-cache`);
}
return env;
}
@@ -655,7 +777,7 @@ async function runLane(lane, baseEnv, logDir, fallbackTimeoutMs) {
const { command, name } = lane;
const timeoutMs = lane.timeoutMs ?? fallbackTimeoutMs;
const logFile = path.join(logDir, `${name}.log`);
const env = laneEnv(name, baseEnv, logDir);
const env = laneEnv(name, baseEnv, logDir, lane.cacheKey);
await mkdir(env.OPENCLAW_DOCKER_CLI_TOOLS_DIR, { recursive: true });
await mkdir(env.OPENCLAW_DOCKER_CACHE_HOME_DIR, { recursive: true });
await fs.promises.writeFile(
@@ -664,12 +786,29 @@ async function runLane(lane, baseEnv, logDir, fallbackTimeoutMs) {
`==> [${name}] cli tools dir: ${env.OPENCLAW_DOCKER_CLI_TOOLS_DIR}`,
`==> [${name}] cache dir: ${env.OPENCLAW_DOCKER_CACHE_HOME_DIR}`,
`==> [${name}] timeout: ${timeoutMs}ms`,
`==> [${name}] retries: ${lane.retries ?? 0}`,
"",
].join("\n"),
);
console.log(`==> [${name}] start`);
const startedAt = Date.now();
const result = await runShellCommand({ command, env, label: name, logFile, timeoutMs });
let result;
const maxAttempts = 1 + Math.max(0, lane.retries ?? 0);
for (let attempt = 1; attempt <= maxAttempts; attempt += 1) {
if (attempt > 1) {
await fs.promises.appendFile(logFile, `\n==> [${name}] retry attempt ${attempt}\n`);
console.log(`==> [${name}] retry ${attempt}/${maxAttempts}`);
}
result = await runShellCommand({ command, env, label: name, logFile, timeoutMs });
if (result.status === 0 || attempt >= maxAttempts) {
break;
}
const retryable =
result.timedOut || (await laneLogMatchesRetryPattern(logFile, lane.retryPatterns));
if (!retryable) {
break;
}
}
const elapsedSeconds = Math.round((Date.now() - startedAt) / 1000);
if (result.status === 0) {
console.log(`==> [${name}] pass ${elapsedSeconds}s`);
@@ -847,6 +986,14 @@ async function tailFile(file, lines) {
return tail.trimEnd();
}
async function laneLogMatchesRetryPattern(logFile, patterns) {
if (!patterns || patterns.length === 0) {
return false;
}
const tail = await tailFile(logFile, 160);
return patterns.some((pattern) => pattern.test(tail));
}
async function printFailureSummary(failures, tailLines) {
console.error(`ERROR: ${failures.length} Docker lane(s) failed.`);
for (const failure of failures) {
@@ -927,6 +1074,12 @@ async function main() {
const preflightEnabled = parseBool(process.env.OPENCLAW_DOCKER_ALL_PREFLIGHT, true);
const preflightCleanup = parseBool(process.env.OPENCLAW_DOCKER_ALL_PREFLIGHT_CLEANUP, true);
const timingsEnabled = parseBool(process.env.OPENCLAW_DOCKER_ALL_TIMINGS, true);
const liveMode = parseLiveMode(process.env.OPENCLAW_DOCKER_ALL_LIVE_MODE);
const liveRetries = parseNonNegativeInt(
process.env.OPENCLAW_DOCKER_ALL_LIVE_RETRIES,
DEFAULT_LIVE_RETRIES,
"OPENCLAW_DOCKER_ALL_LIVE_RETRIES",
);
const timingsFile = path.resolve(
process.env.OPENCLAW_DOCKER_ALL_TIMINGS_FILE || DEFAULT_TIMINGS_FILE,
);
@@ -945,13 +1098,22 @@ async function main() {
appendExtension(baseEnv, "codex");
const timingStore = await loadTimingStore(timingsFile, timingsEnabled);
const orderedLanes = orderLanes(lanes, timingStore);
const orderedTailLanes = orderLanes(tailLanes, timingStore);
const retriedMainLanes = applyLiveRetries(lanes, liveRetries);
const retriedTailLanes = applyLiveRetries(tailLanes, liveRetries);
const configuredLanes =
liveMode === "only"
? applyLiveMode([...retriedMainLanes, ...retriedTailLanes], liveMode)
: applyLiveMode(retriedMainLanes, liveMode);
const configuredTailLanes = liveMode === "only" ? [] : applyLiveMode(retriedTailLanes, liveMode);
const orderedLanes = orderLanes(configuredLanes, timingStore);
const orderedTailLanes = orderLanes(configuredTailLanes, timingStore);
console.log(`==> Docker test logs: ${logDir}`);
console.log(`==> Parallelism: ${parallelism}`);
console.log(`==> Tail parallelism: ${tailParallelism}`);
console.log(`==> Lane timeout: ${laneTimeoutMs}ms`);
console.log(`==> Live mode: ${liveMode}`);
console.log(`==> Live retries: ${liveRetries}`);
console.log(`==> Lane start stagger: ${laneStartStaggerMs}ms`);
console.log(`==> Status interval: ${statusIntervalMs}ms`);
console.log(`==> Fail fast: ${failFast ? "yes" : "no"}`);
@@ -966,10 +1128,10 @@ async function main() {
const schedulerOptions = parseSchedulerOptions(process.env, parallelism);
const tailSchedulerOptions = parseSchedulerOptions(process.env, tailParallelism);
console.log(
`==> Scheduler: weight=${schedulerOptions.weightLimit} docker=${schedulerOptions.resourceLimits.docker} live=${schedulerOptions.resourceLimits.live} npm=${schedulerOptions.resourceLimits.npm} service=${schedulerOptions.resourceLimits.service}`,
`==> Scheduler: weight=${schedulerOptions.weightLimit} ${resourceLimitsSummary(schedulerOptions.resourceLimits)}`,
);
console.log(
`==> Tail scheduler: weight=${tailSchedulerOptions.weightLimit} docker=${tailSchedulerOptions.resourceLimits.docker} live=${tailSchedulerOptions.resourceLimits.live} npm=${tailSchedulerOptions.resourceLimits.npm} service=${tailSchedulerOptions.resourceLimits.service}`,
`==> Tail scheduler: weight=${tailSchedulerOptions.weightLimit} ${resourceLimitsSummary(tailSchedulerOptions.resourceLimits)}`,
);
printLaneManifest("Main", orderedLanes, timingStore);
printLaneManifest("Tail", orderedTailLanes, timingStore);

View File

@@ -13,6 +13,7 @@ CLI_MODEL="${OPENCLAW_LIVE_CLI_BACKEND_MODEL:-}"
CLI_PROVIDER="${CLI_MODEL%%/*}"
CLI_DISABLE_MCP_CONFIG="${OPENCLAW_LIVE_CLI_BACKEND_DISABLE_MCP_CONFIG:-}"
CLI_AUTH_MODE="${OPENCLAW_LIVE_CLI_BACKEND_AUTH:-auto}"
CLI_SETUP_TIMEOUT_SECONDS="${OPENCLAW_LIVE_CLI_BACKEND_SETUP_TIMEOUT_SECONDS:-180}"
TEMP_DIRS=()
DOCKER_USER="${OPENCLAW_DOCKER_USER:-node}"
DOCKER_HOME_MOUNT=()
@@ -236,6 +237,9 @@ export npm_config_cache="$NPM_CONFIG_CACHE"
mkdir -p "$NPM_CONFIG_PREFIX" "$XDG_CACHE_HOME" "$COREPACK_HOME" "$NPM_CONFIG_CACHE"
chmod 700 "$XDG_CACHE_HOME" "$COREPACK_HOME" "$NPM_CONFIG_CACHE" || true
export PATH="$NPM_CONFIG_PREFIX/bin:$PATH"
run_setup_command() {
timeout --foreground "${OPENCLAW_LIVE_CLI_BACKEND_SETUP_TIMEOUT_SECONDS:-180}s" "$@"
}
if [ "${OPENCLAW_DOCKER_AUTH_PRESTAGED:-0}" != "1" ]; then
IFS=',' read -r -a auth_dirs <<<"${OPENCLAW_DOCKER_AUTH_DIRS_RESOLVED:-}"
IFS=',' read -r -a auth_files <<<"${OPENCLAW_DOCKER_AUTH_FILES_RESOLVED:-}"
@@ -285,9 +289,9 @@ package_has_explicit_version() {
esac
}
if [ -n "${OPENCLAW_LIVE_CLI_BACKEND_COMMAND:-}" ] && [ ! -x "${OPENCLAW_LIVE_CLI_BACKEND_COMMAND}" ] && [ -n "$docker_package" ]; then
npm install -g "$docker_package"
run_setup_command npm install -g "$docker_package"
elif [ -n "$docker_package" ] && package_has_explicit_version "$docker_package"; then
npm install -g "$docker_package"
run_setup_command npm install -g "$docker_package"
fi
if [ "$provider" = "codex-cli" ] && [ "${OPENCLAW_LIVE_CLI_BACKEND_AUTH:-auto}" = "api-key" ]; then
codex_login_command="${OPENCLAW_LIVE_CLI_BACKEND_COMMAND:-$NPM_CONFIG_PREFIX/bin/codex}"
@@ -397,6 +401,7 @@ echo "==> Run CLI backend live test in Docker"
echo "==> Model: $CLI_MODEL"
echo "==> Provider: $CLI_PROVIDER"
echo "==> Auth mode: $CLI_AUTH_MODE"
echo "==> Setup timeout: ${CLI_SETUP_TIMEOUT_SECONDS}s"
echo "==> Profile file: $PROFILE_STATUS"
if [[ "$CLI_PROVIDER" == "codex-cli" ]]; then
echo "==> CI-safe Codex config: $CLI_USE_CI_SAFE_CODEX_CONFIG"
@@ -449,6 +454,7 @@ DOCKER_RUN_ARGS=(docker run --rm -t \
-e OPENCLAW_DOCKER_AUTH_FILES_RESOLVED="$AUTH_FILES_CSV" \
-e OPENCLAW_LIVE_DOCKER_SOURCE_STAGE_MODE="${OPENCLAW_LIVE_DOCKER_SOURCE_STAGE_MODE:-copy}" \
-e OPENCLAW_LIVE_CLI_BACKEND_USE_CI_SAFE_CODEX_CONFIG="$CLI_USE_CI_SAFE_CODEX_CONFIG" \
-e OPENCLAW_LIVE_CLI_BACKEND_SETUP_TIMEOUT_SECONDS="$CLI_SETUP_TIMEOUT_SECONDS" \
-e OPENCLAW_DOCKER_CLI_BACKEND_PROVIDER="$CLI_PROVIDER" \
-e OPENCLAW_DOCKER_CLI_BACKEND_COMMAND_DEFAULT="$CLI_DEFAULT_COMMAND" \
-e OPENCLAW_DOCKER_CLI_BACKEND_NPM_PACKAGE="$CLI_DOCKER_NPM_PACKAGE" \