From e648f38efcd033a1953763e9de99db66592e4d4a Mon Sep 17 00:00:00 2001 From: Peter Steinberger Date: Thu, 30 Apr 2026 04:56:06 +0100 Subject: [PATCH] fix: stabilize Parallels update restart checks --- scripts/e2e/parallels/npm-update-scripts.ts | 51 ++++- scripts/e2e/parallels/npm-update-smoke.ts | 234 ++++++++++++++++---- src/cli/daemon-cli/restart-health.test.ts | 29 +++ src/cli/daemon-cli/restart-health.ts | 8 +- 4 files changed, 270 insertions(+), 52 deletions(-) diff --git a/scripts/e2e/parallels/npm-update-scripts.ts b/scripts/e2e/parallels/npm-update-scripts.ts index c1fdfa1026c..0be413f9ca3 100644 --- a/scripts/e2e/parallels/npm-update-scripts.ts +++ b/scripts/e2e/parallels/npm-update-scripts.ts @@ -39,12 +39,34 @@ stop_openclaw_gateway_processes() { OPENCLAW_DISABLE_BUNDLED_PLUGINS=1 /opt/homebrew/bin/openclaw gateway stop || true pkill -f 'openclaw.*gateway' >/dev/null 2>&1 || true } +start_openclaw_gateway() { + if /opt/homebrew/bin/openclaw gateway restart; then + return + fi + pkill -f 'openclaw.*gateway' >/dev/null 2>&1 || true + rm -f /tmp/openclaw-parallels-macos-gateway.log + nohup env OPENCLAW_HOME="$HOME" OPENCLAW_STATE_DIR="$HOME/.openclaw" OPENCLAW_CONFIG_PATH="$HOME/.openclaw/openclaw.json" ${input.auth.apiKeyEnv}=${shellQuote( + input.auth.apiKeyValue, + )} /opt/homebrew/bin/openclaw gateway run --bind loopback --port 18789 --force >/tmp/openclaw-parallels-macos-gateway.log 2>&1 </dev/null & +} +wait_for_gateway() { + deadline=$((SECONDS + 240)) + while [ "$SECONDS" -lt "$deadline" ]; do + if /opt/homebrew/bin/openclaw gateway status --deep --require-rpc --timeout 15000; then + return + fi + sleep 2 + done + cat /tmp/openclaw-parallels-macos-gateway.log >&2 || true + echo "gateway did not become ready after update" >&2 + exit 1 +} scrub_future_plugin_entries stop_openclaw_gateway_processes OPENCLAW_DISABLE_BUNDLED_PLUGINS=1 /opt/homebrew/bin/openclaw update --tag ${shellQuote(input.updateTarget)} --yes --json ${posixVersionCheck("/opt/homebrew/bin/openclaw", input.expectedNeedle)} -/opt/homebrew/bin/openclaw gateway restart -/opt/homebrew/bin/openclaw gateway status --deep --require-rpc +start_openclaw_gateway +wait_for_gateway /opt/homebrew/bin/openclaw models set ${shellQuote(input.auth.modelId)} 
/opt/homebrew/bin/openclaw config set agents.defaults.skipBootstrap true --strict-json ${posixAgentWorkspaceScript("Parallels npm update smoke test assistant.")} @@ -122,12 +144,33 @@ stop_openclaw_gateway_processes() { OPENCLAW_DISABLE_BUNDLED_PLUGINS=1 openclaw gateway stop || true pkill -f 'openclaw.*gateway' >/dev/null 2>&1 || true } +start_openclaw_gateway() { + pkill -f "openclaw gateway run" >/dev/null 2>&1 || true + rm -f /tmp/openclaw-parallels-linux-gateway.log + setsid sh -lc ${shellQuote( + `exec env OPENCLAW_HOME=/root OPENCLAW_STATE_DIR=/root/.openclaw OPENCLAW_CONFIG_PATH=/root/.openclaw/openclaw.json OPENCLAW_DISABLE_BONJOUR=1 ${input.auth.apiKeyEnv}=${shellQuote( + input.auth.apiKeyValue, + )} openclaw gateway run --bind loopback --port 18789 --force >/tmp/openclaw-parallels-linux-gateway.log 2>&1`, + )} >/dev/null 2>&1 < /dev/null & +} +wait_for_gateway() { + deadline=$((SECONDS + 240)) + while [ "$SECONDS" -lt "$deadline" ]; do + if openclaw gateway status --deep --require-rpc --timeout 15000; then + return + fi + sleep 2 + done + cat /tmp/openclaw-parallels-linux-gateway.log >&2 || true + echo "gateway did not become ready after update" >&2 + exit 1 +} scrub_future_plugin_entries stop_openclaw_gateway_processes OPENCLAW_DISABLE_BUNDLED_PLUGINS=1 openclaw update --tag ${shellQuote(input.updateTarget)} --yes --json ${posixVersionCheck("openclaw", input.expectedNeedle)} -openclaw gateway restart -openclaw gateway status --deep --require-rpc +start_openclaw_gateway +wait_for_gateway openclaw models set ${shellQuote(input.auth.modelId)} openclaw config set agents.defaults.skipBootstrap true --strict-json ${posixAgentWorkspaceScript("Parallels npm update smoke test assistant.")} diff --git a/scripts/e2e/parallels/npm-update-smoke.ts b/scripts/e2e/parallels/npm-update-smoke.ts index 7541ffc2712..c49c81a8159 100755 --- a/scripts/e2e/parallels/npm-update-smoke.ts +++ b/scripts/e2e/parallels/npm-update-smoke.ts @@ -14,7 +14,6 @@ import { 
resolveLatestVersion, resolveProviderAuth, run, - runStreaming, say, startHostServer, writeJson, @@ -45,6 +44,11 @@ interface Job { promise: Promise; } +interface UpdateJobContext { + append(chunk: string | Uint8Array): void; + logPath: string; +} + interface NpmUpdateSummary { packageSpec: string; updateTarget: string; @@ -296,15 +300,15 @@ class NpmUpdateSmoke { const jobs: Job[] = []; if (this.options.platforms.has("macos")) { ensureVmRunning(macosVm); - jobs.push(this.spawnUpdate("macOS", "macos", () => this.runMacosUpdate())); + jobs.push(this.spawnUpdate("macOS", "macos", (ctx) => this.runMacosUpdate(ctx))); } if (this.options.platforms.has("windows")) { ensureVmRunning(windowsVm); - jobs.push(this.spawnUpdate("Windows", "windows", () => this.runWindowsUpdate())); + jobs.push(this.spawnUpdate("Windows", "windows", (ctx) => this.runWindowsUpdate(ctx))); } if (this.options.platforms.has("linux")) { ensureVmRunning(this.linuxVm); - jobs.push(this.spawnUpdate("Linux", "linux", () => this.runLinuxUpdate())); + jobs.push(this.spawnUpdate("Linux", "linux", (ctx) => this.runLinuxUpdate(ctx))); } await this.monitorJobs("update", jobs); for (const job of jobs) { @@ -319,7 +323,11 @@ class NpmUpdateSmoke { } } - private spawnUpdate(label: string, platform: Platform, fn: () => Promise | void): Job { + private spawnUpdate( + label: string, + platform: Platform, + fn: (ctx: UpdateJobContext) => Promise | void, + ): Job { const logPath = path.join(this.runDir, `${platform}-update.log`); const job: Job = { done: false, @@ -328,8 +336,6 @@ class NpmUpdateSmoke { promise: Promise.resolve(1), }; job.promise = (async () => { - const originalStdout = process.stdout.write.bind(process.stdout); - const originalStderr = process.stderr.write.bind(process.stderr); let log = ""; const append = (chunk: string | Uint8Array): boolean => { const text = typeof chunk === "string" ? 
chunk : Buffer.from(chunk).toString("utf8"); @@ -340,11 +346,7 @@ class NpmUpdateSmoke { append(`${label} update timed out after ${updateTimeoutSeconds}s\n`); }, updateTimeoutSeconds * 1000); try { - process.stdout.write = ((chunk: string | Uint8Array) => - append(chunk)) as typeof process.stdout.write; - process.stderr.write = ((chunk: string | Uint8Array) => - append(chunk)) as typeof process.stderr.write; - await fn(); + await fn({ append, logPath }); await writeFile(logPath, log, "utf8"); return 0; } catch (error) { @@ -353,8 +355,6 @@ class NpmUpdateSmoke { return 1; } finally { clearTimeout(timeout); - process.stdout.write = originalStdout; - process.stderr.write = originalStderr; } })().finally(() => { job.done = true; @@ -362,16 +362,16 @@ class NpmUpdateSmoke { return job; } - private runMacosUpdate(): void { - this.guestMacos(this.updateScript("macos"), updateTimeoutSeconds * 1000); + private async runMacosUpdate(ctx: UpdateJobContext): Promise { + await this.guestMacos(this.updateScript("macos"), updateTimeoutSeconds * 1000, ctx); } - private runWindowsUpdate(): Promise { - return this.guestWindows(this.updateScript("windows"), updateTimeoutSeconds * 1000); + private runWindowsUpdate(ctx: UpdateJobContext): Promise { + return this.guestWindows(this.updateScript("windows"), updateTimeoutSeconds * 1000, ctx); } - private runLinuxUpdate(): void { - this.guestLinux(this.updateScript("linux"), updateTimeoutSeconds * 1000); + private async runLinuxUpdate(ctx: UpdateJobContext): Promise { + await this.guestLinux(this.updateScript("linux"), updateTimeoutSeconds * 1000, ctx); } private updateScript(platform: Platform): string { @@ -436,24 +436,112 @@ class NpmUpdateSmoke { } } - private guestMacos(script: string, timeoutMs: number): void { - run( + private async guestMacos( + script: string, + timeoutMs: number, + ctx: UpdateJobContext, + ): Promise { + const macosExecArgs = this.resolveMacosUpdateExecArgs(ctx); + const status = await this.runStreamingToJobLog( 
"prlctl", - [ - "exec", - macosVm, - "--current-user", - "/usr/bin/env", - "PATH=/opt/homebrew/bin:/opt/homebrew/opt/node/bin:/opt/homebrew/sbin:/usr/bin:/bin:/usr/sbin:/sbin", - "/bin/bash", - "-lc", - script, - ], - { timeoutMs }, + ["exec", macosVm, ...macosExecArgs, "/bin/bash", "-lc", script], + timeoutMs, + ctx, ); + if (status !== 0) { + throw new Error(`macOS update command failed with exit code ${status}`); + } } - private async guestWindows(script: string, timeoutMs: number): Promise { + private resolveMacosUpdateExecArgs(ctx: UpdateJobContext): string[] { + const guestPath = + "/opt/homebrew/bin:/opt/homebrew/opt/node/bin:/opt/homebrew/sbin:/usr/bin:/bin:/usr/sbin:/sbin"; + const currentUser = run("prlctl", ["exec", macosVm, "--current-user", "whoami"], { + check: false, + quiet: true, + timeoutMs: 45_000, + }); + const user = currentUser.stdout.trim().replaceAll("\r", "").split("\n").at(-1) ?? ""; + if (currentUser.status === 0 && /^[A-Za-z0-9._-]+$/.test(user)) { + return ["--current-user", "/usr/bin/env", `PATH=${guestPath}`]; + } + + const fallbackUser = this.resolveMacosDesktopUser(); + if (!fallbackUser) { + ctx.append(currentUser.stdout); + ctx.append(currentUser.stderr); + throw new Error("macOS desktop user unavailable before update phase"); + } + ctx.append( + `desktop user unavailable via Parallels --current-user; using root sudo fallback for ${fallbackUser}\n`, + ); + const home = this.resolveMacosDesktopHome(fallbackUser); + return [ + "/usr/bin/sudo", + "-H", + "-u", + fallbackUser, + "/usr/bin/env", + `HOME=${home}`, + `USER=${fallbackUser}`, + `LOGNAME=${fallbackUser}`, + `PATH=${guestPath}`, + ]; + } + + private resolveMacosDesktopUser(): string { + const consoleUser = + run("prlctl", ["exec", macosVm, "/usr/bin/stat", "-f", "%Su", "/dev/console"], { + check: false, + quiet: true, + timeoutMs: 30_000, + }) + .stdout.trim() + .replaceAll("\r", "") + .split("\n") + .at(-1) ?? 
""; + if ( + /^[A-Za-z0-9._-]+$/.test(consoleUser) && + consoleUser !== "root" && + consoleUser !== "loginwindow" + ) { + return consoleUser; + } + const users = run( + "prlctl", + ["exec", macosVm, "/usr/bin/dscl", ".", "-list", "/Users", "NFSHomeDirectory"], + { check: false, quiet: true, timeoutMs: 30_000 }, + ).stdout.replaceAll("\r", ""); + for (const line of users.split("\n")) { + const [user, home] = line.trim().split(/\s+/); + if ( + user && + home?.startsWith("/Users/") && + !user.startsWith("_") && + user !== "Shared" && + user !== ".localized" + ) { + return user; + } + } + return ""; + } + + private resolveMacosDesktopHome(user: string): string { + const output = run( + "prlctl", + ["exec", macosVm, "/usr/bin/dscl", ".", "-read", `/Users/${user}`, "NFSHomeDirectory"], + { check: false, quiet: true, timeoutMs: 30_000 }, + ).stdout.replaceAll("\r", ""); + const match = /NFSHomeDirectory:\s*(\S+)/.exec(output); + return match?.[1] ?? `/Users/${user}`; + } + + private async guestWindows( + script: string, + timeoutMs: number, + ctx: UpdateJobContext, + ): Promise { const fileBase = `openclaw-parallels-npm-update-windows-${process.pid}-${Date.now()}`; const pathsScript = `$base = Join-Path $env:TEMP '${fileBase}' $scriptPath = "$base.ps1" @@ -474,7 +562,7 @@ ${script} } finally { Set-Content -Path $donePath -Value 'done' -Encoding UTF8 }`; - run( + const writeScript = run( "prlctl", [ "exec", @@ -490,11 +578,21 @@ Remove-Item -Path $scriptPath, $logPath, $donePath, $exitPath -Force -ErrorActio [System.IO.File]::WriteAllText($scriptPath, [Console]::In.ReadToEnd(), [System.Text.UTF8Encoding]::new($false)) if (!(Test-Path $scriptPath)) { throw "background update script was not written" }`), ], - { input: payload, timeoutMs: Math.min(timeoutMs, 120_000) }, + { check: false, input: payload, timeoutMs: Math.min(timeoutMs, 120_000) }, ); + if (writeScript.stdout) { + ctx.append(writeScript.stdout); + } + if (writeScript.stderr) { + ctx.append(writeScript.stderr); + 
} + if (writeScript.status !== 0) { + throw new Error( + `Windows update background script write failed with exit code ${writeScript.status}`, + ); + } - const launchLogPath = path.join(this.runDir, `${fileBase}-launch.log`); - const launchStatus = await runStreaming( + const launchStatus = await this.runStreamingToJobLog( "prlctl", [ "exec", @@ -506,12 +604,9 @@ if (!(Test-Path $scriptPath)) { throw "background update script was not written" "/c", `start "" /min powershell.exe -NoProfile -WindowStyle Hidden -ExecutionPolicy Bypass -File "%TEMP%\\${fileBase}.ps1"`, ], - { logPath: launchLogPath, quiet: true, timeoutMs: 20_000 }, + 20_000, + ctx, ); - const launchLog = await readFile(launchLogPath, "utf8").catch(() => ""); - if (launchLog) { - process.stdout.write(launchLog); - } if (launchStatus !== 0 && launchStatus !== 124) { throw new Error(`Windows update background launch failed with exit code ${launchStatus}`); } @@ -550,10 +645,10 @@ if (Test-Path $donePath) { { check: false, timeoutMs: Math.min(30_000, Math.max(1_000, deadline - Date.now())) }, ); if (poll.stdout) { - process.stdout.write(poll.stdout); + ctx.append(poll.stdout); } if (poll.stderr) { - process.stderr.write(poll.stderr); + ctx.append(poll.stderr); } const offsetMatch = poll.stdout.match(/__OPENCLAW_LOG_OFFSET__:(\d+)/); if (offsetMatch) { @@ -588,9 +683,54 @@ Remove-Item -Path $scriptPath, $logPath, $donePath, $exitPath -Force -ErrorActio throw new Error(`Windows update timed out after ${updateTimeoutSeconds}s`); } - private guestLinux(script: string, timeoutMs: number): void { - run("prlctl", ["exec", this.linuxVm, "/usr/bin/env", "HOME=/root", "bash", "-lc", script], { + private async guestLinux( + script: string, + timeoutMs: number, + ctx: UpdateJobContext, + ): Promise { + const status = await this.runStreamingToJobLog( + "prlctl", + ["exec", this.linuxVm, "/usr/bin/env", "HOME=/root", "bash", "-lc", script], timeoutMs, + ctx, + ); + if (status !== 0) { + throw new Error(`Linux update 
command failed with exit code ${status}`); + } + } + + private async runStreamingToJobLog( + command: string, + args: string[], + timeoutMs: number, + ctx: UpdateJobContext, + ): Promise { + return await new Promise((resolve, reject) => { + const child = spawn(command, args, { + cwd: repoRoot, + env: process.env, + stdio: ["ignore", "pipe", "pipe"], + }); + + child.stdout.on("data", (chunk: Buffer) => ctx.append(chunk)); + child.stderr.on("data", (chunk: Buffer) => ctx.append(chunk)); + + let timedOut = false; + const timer = setTimeout(() => { + timedOut = true; + child.kill("SIGTERM"); + setTimeout(() => child.kill("SIGKILL"), 2_000).unref(); + }, timeoutMs); + + child.on("error", reject); + child.on("close", (code, signal) => { + clearTimeout(timer); + if (timedOut) { + resolve(124); + return; + } + resolve(code ?? (signal ? 128 : 1)); + }); }); } diff --git a/src/cli/daemon-cli/restart-health.test.ts b/src/cli/daemon-cli/restart-health.test.ts index 000a33686b6..2b491882aa0 100644 --- a/src/cli/daemon-cli/restart-health.test.ts +++ b/src/cli/daemon-cli/restart-health.test.ts @@ -351,6 +351,35 @@ describe("inspectGatewayRestart", () => { expect(snapshot.versionMismatch).toBeUndefined(); }); + it("accepts matching-version restart liveness when the probe lacks operator scope", async () => { + probeGateway.mockResolvedValue({ + ok: false, + close: null, + connectLatencyMs: 12, + error: "missing scope: operator.read", + auth: { capability: "connected_no_operator_scope" }, + server: { version: "2026.4.24", connId: "new" }, + }); + + const snapshot = await inspectGatewayRestartWithSnapshot({ + runtime: { status: "running", pid: 8000 }, + expectedVersion: "2026.4.24", + portUsage: { + port: 18789, + status: "busy", + listeners: [{ pid: 8000, commandLine: "openclaw-gateway" }], + hints: [], + }, + }); + + expect(snapshot).toMatchObject({ + healthy: true, + gatewayVersion: "2026.4.24", + expectedVersion: "2026.4.24", + }); + 
expect(snapshot.versionMismatch).toBeUndefined(); + }); + it("stops waiting once the restarted gateway reports the wrong version", async () => { probeGateway.mockResolvedValue({ ok: true, diff --git a/src/cli/daemon-cli/restart-health.ts b/src/cli/daemon-cli/restart-health.ts index 6695d1e96c3..4f122c0a23c 100644 --- a/src/cli/daemon-cli/restart-health.ts +++ b/src/cli/daemon-cli/restart-health.ts @@ -237,8 +237,14 @@ async function confirmGatewayReachable(params: { timeoutMs: 3_000, includeDetails: params.includeHealthDetails === true, }); + const reachedGateway = + probe.ok || + looksLikeAuthClose(probe.close?.code, probe.close?.reason) || + (probe.connectLatencyMs != null && + probe.server?.version != null && + probe.auth.capability === "connected_no_operator_scope"); return { - reachable: probe.ok || looksLikeAuthClose(probe.close?.code, probe.close?.reason), + reachable: reachedGateway, gatewayVersion: probe.server?.version ?? null, activatedPluginErrors: readActivatedPluginErrors(probe.health), channelProbeErrors: readChannelProbeErrors(probe.health),