fix(parallels): harden npm update smoke

This commit is contained in:
Peter Steinberger
2026-04-25 00:49:52 +01:00
parent c12af80b5b
commit a1087ea7a6
3 changed files with 317 additions and 48 deletions

View File

@@ -380,16 +380,25 @@ source_tree_dirty_for_build() {
[[ -n "$(git status --porcelain -- src ui packages extensions package.json pnpm-lock.yaml 'tsconfig*.json' 2>/dev/null)" ]]
}
current_build_has_control_ui() {
[[ -f dist/control-ui/index.html ]] || return 1
compgen -G "dist/control-ui/assets/*" >/dev/null
}
ensure_current_build() {
local build_commit head rc
head="$(git rev-parse HEAD)"
build_commit="$(current_build_commit)"
if [[ "$build_commit" == "$head" ]] && ! source_tree_dirty_for_build; then
if [[ "$build_commit" == "$head" ]] && ! source_tree_dirty_for_build && current_build_has_control_ui; then
return 0
fi
say "Build dist for current head"
pnpm build
rc=$?
if [[ $rc -eq 0 ]]; then
pnpm ui:build
rc=$?
fi
if [[ $rc -eq 0 ]]; then
parallels_package_assert_no_generated_drift
rc=$?
@@ -525,6 +534,30 @@ function Invoke-CaptureLogged {
return ($output | Out-String).Trim()
}
function Test-GatewayListenerReady {
$listeners = Get-NetTCPConnection -LocalPort 18789 -State Listen -ErrorAction SilentlyContinue
return [bool]$listeners
}
function Test-GatewayLogReady {
$logDir = Join-Path $env:LOCALAPPDATA 'Temp\openclaw'
if (-not (Test-Path $logDir)) {
return $false
}
$logFile = Get-ChildItem -Path $logDir -Filter 'openclaw-*.log' -File -ErrorAction SilentlyContinue |
Sort-Object LastWriteTime -Descending |
Select-Object -First 1
if (-not $logFile) {
return $false
}
try {
$tail = Get-Content -Path $logFile.FullName -Tail 120 -ErrorAction Stop | Out-String
} catch {
return $false
}
return $tail -match '"ready \('
}
function Wait-GatewayRpcReady {
param(
[Parameter(Mandatory = $true)][string]$OpenClawPath,
@@ -534,11 +567,17 @@ function Wait-GatewayRpcReady {
for ($attempt = 1; $attempt -le $Attempts; $attempt++) {
Write-ProgressLog "update.gateway-status.attempt-$attempt"
if ((Test-GatewayListenerReady) -and (Test-GatewayLogReady)) {
Write-ProgressLog "update.gateway-status.ready-log-$attempt"
return $true
}
try {
$statusOutput = Invoke-CaptureLogged 'openclaw gateway status' { & $OpenClawPath gateway status --deep --require-rpc }
if ($statusOutput -match 'Read probe:\s*failed') {
throw 'gateway status returned without RPC read readiness'
$probeOutput = Invoke-CaptureLogged 'openclaw gateway probe' { & $OpenClawPath gateway probe --url ws://127.0.0.1:18789 --timeout 5000 --json }
$probe = $probeOutput | ConvertFrom-Json
if (-not $probe.ok) {
throw 'gateway probe returned without RPC readiness'
}
Invoke-CaptureLogged 'openclaw gateway status' { & $OpenClawPath gateway status --deep --require-rpc } | Out-Null
return $true
} catch {
if ($attempt -ge $Attempts) {
@@ -674,12 +713,11 @@ function Invoke-OpenClawUpdateWithTimeout {
param(
[Parameter(Mandatory = $true)][string]$OpenClawPath,
[Parameter(Mandatory = $true)][string]$UpdateTarget,
[int]$TimeoutSeconds = 600
[int]$TimeoutSeconds = 1200
)
$updateJob = Start-Job -ScriptBlock {
param([string]$Path, [string]$Target)
$env:OPENCLAW_DISABLE_BUNDLED_PLUGINS = '1'
$output = & $Path update --tag $Target --yes --json *>&1
[pscustomobject]@{
ExitCode = $LASTEXITCODE
@@ -707,6 +745,65 @@ function Invoke-OpenClawUpdateWithTimeout {
Stop-OpenClawUpdateProcesses
}
function Invoke-OpenClawAgentWithTimeout {
param(
[Parameter(Mandatory = $true)][string]$OpenClawPath,
[Parameter(Mandatory = $true)][string]$SessionId,
[int]$TimeoutSeconds = 600
)
$message = 'Reply with exact ASCII text OK only.'
$stdout = Join-Path $env:TEMP ("openclaw-parallels-agent-{0}.out.log" -f ([guid]::NewGuid().ToString('N')))
$stderr = Join-Path $env:TEMP ("openclaw-parallels-agent-{0}.err.log" -f ([guid]::NewGuid().ToString('N')))
$agentJob = Start-Job -ScriptBlock {
param([string]$Path, [string]$AgentSessionId, [string]$AgentMessage, [string]$StdoutPath, [string]$StderrPath)
& $Path agent --local --agent main --session-id $AgentSessionId --message $AgentMessage --json > $StdoutPath 2> $StderrPath
exit $LASTEXITCODE
} -ArgumentList $OpenClawPath, $SessionId, $message, $stdout, $stderr
$deadline = (Get-Date).AddSeconds($TimeoutSeconds)
$combined = ''
while ((Get-Date) -lt $deadline) {
Start-Sleep -Seconds 2
$out = ''
$err = ''
if (Test-Path $stdout) {
$out = Get-Content -Path $stdout -Raw -ErrorAction SilentlyContinue
}
if (Test-Path $stderr) {
$err = Get-Content -Path $stderr -Raw -ErrorAction SilentlyContinue
}
$combined = "$out`n$err"
if ($combined -match '"finalAssistantRawText":\s*"OK"' -or $combined -match '"finalAssistantVisibleText":\s*"OK"') {
if ($combined.Trim().Length -gt 0) {
$combined.Trim() | Tee-Object -FilePath $LogPath -Append | Out-Null
}
Stop-Job $agentJob -ErrorAction SilentlyContinue
Remove-Job $agentJob -Force -ErrorAction SilentlyContinue
return 0
}
if ($agentJob.State -in @('Completed', 'Failed', 'Stopped')) {
if ($combined.Trim().Length -gt 0) {
$combined.Trim() | Tee-Object -FilePath $LogPath -Append | Out-Null
}
Receive-Job $agentJob -ErrorAction SilentlyContinue | Out-Null
$jobState = $agentJob.State
Remove-Job $agentJob -Force -ErrorAction SilentlyContinue
if ($jobState -ne 'Completed') {
throw "openclaw agent failed with job state $jobState"
}
throw 'openclaw agent finished without OK response'
}
}
Stop-Job $agentJob -ErrorAction SilentlyContinue
Remove-Job $agentJob -Force -ErrorAction SilentlyContinue
Write-ProgressLog 'update.agent-turn.timeout'
if ($combined.Trim().Length -gt 0) {
$combined.Trim() | Tee-Object -FilePath $LogPath -Append | Out-Null
}
throw "openclaw agent timed out after ${TimeoutSeconds}s"
}
function Start-GatewayRunFallback {
param(
[Parameter(Mandatory = $true)][string]$OpenClawPath
@@ -836,13 +933,10 @@ try {
# an explicit start only if the RPC endpoint never returns.
Write-ProgressLog 'update.restart-gateway'
Restart-GatewayWithRecovery -OpenClawPath $openclaw
Stop-OpenClawGatewayProcesses
Complete-WorkspaceSetup
Write-ProgressLog 'update.agent-turn'
Invoke-CaptureLogged 'openclaw agent' { & $openclaw agent --local --agent main --session-id $SessionId --message 'Reply with exact ASCII text OK only.' --json } | Out-Null
$exitCode = $LASTEXITCODE
if ($null -eq $exitCode) {
$exitCode = 0
}
$exitCode = Invoke-OpenClawAgentWithTimeout -OpenClawPath $openclaw -SessionId $SessionId
Write-ProgressLog 'update.done'
Set-Content -Path $DonePath -Value ([string]$exitCode)
exit $exitCode
@@ -921,7 +1015,19 @@ verify_macos_update_after_transport_loss() {
cat <<EOF | prlctl exec "$MACOS_VM" /usr/bin/tee "$script_path" >/dev/null
set -euo pipefail
export PATH=/opt/homebrew/bin:/opt/homebrew/opt/node/bin:/opt/homebrew/sbin:/usr/bin:/bin:/usr/sbin:/sbin
export OPENCLAW_PLUGIN_STAGE_DIR="\$HOME/.openclaw/plugin-runtime-deps-parallels"
busy="\$(/bin/ps -axo command | /usr/bin/egrep 'openclaw update|npm install|pnpm install|pnpm run build' | /usr/bin/egrep -v 'egrep|openclaw-npm-update-macos-recover' || true)"
gateway_listener_ready() {
/usr/sbin/lsof -tiTCP:18789 -sTCP:LISTEN >/dev/null 2>&1
}
gateway_log_ready() {
latest="\$(/bin/ls -t /tmp/openclaw/openclaw-*.log 2>/dev/null | /usr/bin/head -n 1 || true)"
[ -n "\$latest" ] || return 1
/usr/bin/tail -n 160 "\$latest" | /usr/bin/grep -q 'ready ('
}
gateway_smoke_ready() {
gateway_listener_ready && gateway_log_ready
}
if [ -n "\$busy" ]; then
printf 'update still has active npm/pnpm/openclaw processes\n%s\n' "\$busy" >&2
exit 1
@@ -937,10 +1043,10 @@ if [ -n "$expected_needle" ]; then
;;
esac
fi
/opt/homebrew/bin/openclaw gateway status --deep --require-rpc >/dev/null 2>&1 || /opt/homebrew/bin/openclaw gateway restart || true
gateway_smoke_ready || /opt/homebrew/bin/openclaw gateway restart || true
gateway_ready=0
for _ in 1 2 3 4 5 6; do
if /opt/homebrew/bin/openclaw gateway status --deep --require-rpc; then
if gateway_smoke_ready; then
gateway_ready=1
break
fi
@@ -949,7 +1055,7 @@ done
if [ "\$gateway_ready" != "1" ]; then
/opt/homebrew/bin/openclaw gateway start || true
for _ in 1 2 3 4 5 6; do
if /opt/homebrew/bin/openclaw gateway status --deep --require-rpc; then
if gateway_smoke_ready; then
gateway_ready=1
break
fi
@@ -957,7 +1063,7 @@ if [ "\$gateway_ready" != "1" ]; then
done
fi
if [ "\$gateway_ready" != "1" ]; then
echo "gateway did not become RPC-ready after transport recovery" >&2
echo "gateway did not become ready after transport recovery" >&2
exit 1
fi
workspace="\${OPENCLAW_WORKSPACE_DIR:-\$HOME/.openclaw/workspace}"
@@ -993,7 +1099,7 @@ print(base64.b64encode(os.environ["PROVIDER_KEY"].encode("utf-8")).decode("ascii
PY
)"
set +e
guest_powershell_poll 120 "$(cat <<EOF
guest_powershell_poll 720 "$(cat <<EOF
\$ErrorActionPreference = 'Stop'
\$openclaw = Join-Path \$env:APPDATA 'npm\\openclaw.cmd'
if (-not (Test-Path \$openclaw)) {
@@ -1126,7 +1232,59 @@ New-Item -ItemType Directory -Path \$stateDir -Force | Out-Null
}
'@ | Set-Content -Path (Join-Path \$stateDir 'workspace-state.json') -Encoding UTF8
Remove-Item (Join-Path \$workspace 'BOOTSTRAP.md') -Force -ErrorAction SilentlyContinue
& \$openclaw agent --local --agent main --session-id 'parallels-npm-update-windows-transport-recovery-$expected_needle' --message 'Reply with exact ASCII text OK only.' --json
Stop-GatewayListeners
\$agentStdout = Join-Path \$env:TEMP ("openclaw-parallels-agent-{0}.out.log" -f ([guid]::NewGuid().ToString('N')))
\$agentStderr = Join-Path \$env:TEMP ("openclaw-parallels-agent-{0}.err.log" -f ([guid]::NewGuid().ToString('N')))
\$agentJob = Start-Job -ScriptBlock {
param([string]\$Path, [string]\$StdoutPath, [string]\$StderrPath)
& \$Path agent --local --agent main --session-id 'parallels-npm-update-windows-transport-recovery-$expected_needle' --message 'Reply with exact ASCII text OK only.' --json > \$StdoutPath 2> \$StderrPath
exit \$LASTEXITCODE
} -ArgumentList \$openclaw, \$agentStdout, \$agentStderr
\$agentDeadline = (Get-Date).AddSeconds(600)
\$agentCombined = ''
while ((Get-Date) -lt \$agentDeadline) {
Start-Sleep -Seconds 2
\$agentOut = ''
\$agentErr = ''
if (Test-Path \$agentStdout) {
\$agentOut = Get-Content -Path \$agentStdout -Raw -ErrorAction SilentlyContinue
}
if (Test-Path \$agentStderr) {
\$agentErr = Get-Content -Path \$agentStderr -Raw -ErrorAction SilentlyContinue
}
\$agentCombined = \$agentOut + [Environment]::NewLine + \$agentErr
if (\$agentCombined -match '"finalAssistantRawText":\s*"OK"' -or \$agentCombined -match '"finalAssistantVisibleText":\s*"OK"') {
if (\$agentCombined.Trim().Length -gt 0) {
\$agentCombined.Trim() | Write-Output
}
Stop-Job \$agentJob -ErrorAction SilentlyContinue
Remove-Job \$agentJob -Force -ErrorAction SilentlyContinue
\$agentJob = \$null
break
}
if (\$agentJob.State -in @('Completed', 'Failed', 'Stopped')) {
if (\$agentCombined.Trim().Length -gt 0) {
\$agentCombined.Trim() | Write-Output
}
Receive-Job \$agentJob -ErrorAction SilentlyContinue | Out-Null
\$agentJobState = \$agentJob.State
Remove-Job \$agentJob -Force -ErrorAction SilentlyContinue
\$agentJob = \$null
if (\$agentJobState -ne 'Completed') {
throw "openclaw agent failed with job state \$agentJobState"
}
throw 'openclaw agent finished without OK response'
break
}
}
if (\$null -ne \$agentJob) {
Stop-Job \$agentJob -ErrorAction SilentlyContinue
Remove-Job \$agentJob -Force -ErrorAction SilentlyContinue
if (\$agentCombined.Trim().Length -gt 0) {
\$agentCombined.Trim() | Write-Output
}
throw 'openclaw agent timed out after 600s'
}
EOF
)"
local rc=$?
@@ -1146,14 +1304,24 @@ start_timeout_guard() {
if [[ -n "$log_path" ]]; then
dump_log_tail "$label" "$log_path"
fi
kill "$pid" >/dev/null 2>&1 || true
terminate_process_tree "$pid" TERM
sleep 2
kill -9 "$pid" >/dev/null 2>&1 || true
terminate_process_tree "$pid" KILL
fi
) >&2 &
printf '%s\n' "$!"
}
terminate_process_tree() {
local pid="$1"
local signal_name="${2:-TERM}"
local child
pgrep -P "$pid" 2>/dev/null | while read -r child; do
terminate_process_tree "$child" "$signal_name"
done
kill "-$signal_name" "$pid" >/dev/null 2>&1 || true
}
stop_timeout_guard() {
local pid="${1:-}"
[[ -n "$pid" ]] || return 0
@@ -1210,27 +1378,50 @@ host_timeout_exec() {
shift
HOST_TIMEOUT_S="$timeout_s" "$PYTHON_BIN" - "$@" <<'PY'
import os
import signal
import subprocess
import sys
timeout = int(os.environ["HOST_TIMEOUT_S"])
args = sys.argv[1:]
process = subprocess.Popen(
args,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
start_new_session=True,
)
try:
completed = subprocess.run(args, stdout=subprocess.PIPE, stderr=subprocess.PIPE, timeout=timeout)
except subprocess.TimeoutExpired as exc:
if exc.stdout:
sys.stdout.buffer.write(exc.stdout)
if exc.stderr:
sys.stderr.buffer.write(exc.stderr)
stdout, stderr = process.communicate(timeout=timeout)
except subprocess.TimeoutExpired:
try:
os.killpg(process.pid, signal.SIGTERM)
except ProcessLookupError:
pass
except PermissionError:
pass
try:
stdout, stderr = process.communicate(timeout=2)
except subprocess.TimeoutExpired:
try:
os.killpg(process.pid, signal.SIGKILL)
except ProcessLookupError:
pass
except PermissionError:
pass
stdout, stderr = process.communicate()
if stdout:
sys.stdout.buffer.write(stdout)
if stderr:
sys.stderr.buffer.write(stderr)
sys.stderr.write(f"host timeout after {timeout}s\n")
raise SystemExit(124)
if completed.stdout:
sys.stdout.buffer.write(completed.stdout)
if completed.stderr:
sys.stderr.buffer.write(completed.stderr)
raise SystemExit(completed.returncode)
if stdout:
sys.stdout.buffer.write(stdout)
if stderr:
sys.stderr.buffer.write(stderr)
raise SystemExit(process.returncode)
PY
}
@@ -1401,11 +1592,23 @@ run_macos_update() {
set -euo pipefail
export PATH=/opt/homebrew/bin:/opt/homebrew/opt/node/bin:/opt/homebrew/sbin:/usr/bin:/bin:/usr/sbin:/sbin
if [ -z "\${HOME:-}" ]; then export HOME="/Users/\$(id -un)"; fi
export OPENCLAW_PLUGIN_STAGE_DIR="\$HOME/.openclaw/plugin-runtime-deps-parallels"
if [ -z "\${$API_KEY_ENV:-}" ]; then
echo "$API_KEY_ENV is required in the macOS update environment" >&2
exit 1
fi
cd "\$HOME"
gateway_listener_ready() {
/usr/sbin/lsof -tiTCP:18789 -sTCP:LISTEN >/dev/null 2>&1
}
gateway_log_ready() {
latest="\$(/bin/ls -t /tmp/openclaw/openclaw-*.log 2>/dev/null | /usr/bin/head -n 1 || true)"
[ -n "\$latest" ] || return 1
/usr/bin/tail -n 160 "\$latest" | /usr/bin/grep -q 'ready ('
}
gateway_smoke_ready() {
gateway_listener_ready && gateway_log_ready
}
scrub_future_plugin_entries() {
node - <<'JS' || true
const fs = require("fs");
@@ -1446,7 +1649,7 @@ stop_openclaw_gateway_processes() {
# host can observe new plugin metadata mid-update and abort config validation.
scrub_future_plugin_entries
stop_openclaw_gateway_processes
OPENCLAW_DISABLE_BUNDLED_PLUGINS=1 /opt/homebrew/bin/openclaw update --tag "$update_target" --yes --json
/opt/homebrew/bin/openclaw update --tag "$update_target" --yes --json
# Same-guest npm upgrades can leave the old gateway process holding the old
# bundled plugin host version. Stop it before post-update config commands.
stop_openclaw_gateway_processes
@@ -1470,7 +1673,7 @@ fi
/opt/homebrew/bin/openclaw gateway restart || true
gateway_ready=0
for _ in 1 2 3 4 5 6 7 8; do
if /opt/homebrew/bin/openclaw gateway status --deep --require-rpc >/dev/null 2>&1; then
if gateway_smoke_ready; then
gateway_ready=1
break
fi
@@ -1480,7 +1683,7 @@ if [ "\$gateway_ready" != "1" ]; then
stop_openclaw_gateway_processes
/opt/homebrew/bin/openclaw gateway run --bind loopback --port 18789 --force >/tmp/openclaw-parallels-npm-update-macos-gateway.log 2>&1 </dev/null &
for _ in 1 2 3 4 5 6 7 8; do
if /opt/homebrew/bin/openclaw gateway status --deep --require-rpc >/dev/null 2>&1; then
if gateway_smoke_ready; then
gateway_ready=1
break
fi
@@ -1490,7 +1693,9 @@ fi
if [ "\$gateway_ready" != "1" ]; then
tail -n 120 /tmp/openclaw-parallels-npm-update-macos-gateway.log 2>/dev/null || true
fi
/opt/homebrew/bin/openclaw gateway status --deep --require-rpc
if [ "\$gateway_ready" != "1" ]; then
/opt/homebrew/bin/openclaw gateway status --deep --require-rpc
fi
workspace="\${OPENCLAW_WORKSPACE_DIR:-\$HOME/.openclaw/workspace}"
mkdir -p "\$workspace/.openclaw"
cat > "\$workspace/IDENTITY.md" <<'IDENTITY_EOF'
@@ -1577,7 +1782,7 @@ stop_openclaw_gateway_processes() {
# the old host can observe new plugin metadata mid-update and abort validation.
scrub_future_plugin_entries
stop_openclaw_gateway_processes
OPENCLAW_DISABLE_BUNDLED_PLUGINS=1 openclaw update --tag "$update_target" --yes --json
openclaw update --tag "$update_target" --yes --json
# The fresh Linux lane starts a manual gateway; stop the old process before
# post-update config validation sees mixed old-host/new-plugin metadata.
stop_openclaw_gateway_processes

View File

@@ -413,27 +413,50 @@ host_timeout_exec() {
shift
HOST_TIMEOUT_S="$timeout_s" python3 - "$@" <<'PY'
import os
import signal
import subprocess
import sys
timeout = int(os.environ["HOST_TIMEOUT_S"])
args = sys.argv[1:]
process = subprocess.Popen(
args,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
start_new_session=True,
)
try:
completed = subprocess.run(args, stdout=subprocess.PIPE, stderr=subprocess.PIPE, timeout=timeout)
except subprocess.TimeoutExpired as exc:
if exc.stdout:
sys.stdout.buffer.write(exc.stdout)
if exc.stderr:
sys.stderr.buffer.write(exc.stderr)
stdout, stderr = process.communicate(timeout=timeout)
except subprocess.TimeoutExpired:
try:
os.killpg(process.pid, signal.SIGTERM)
except ProcessLookupError:
pass
except PermissionError:
pass
try:
stdout, stderr = process.communicate(timeout=2)
except subprocess.TimeoutExpired:
try:
os.killpg(process.pid, signal.SIGKILL)
except ProcessLookupError:
pass
except PermissionError:
pass
stdout, stderr = process.communicate()
if stdout:
sys.stdout.buffer.write(stdout)
if stderr:
sys.stderr.buffer.write(stderr)
sys.stderr.write(f"host timeout after {timeout}s\n")
raise SystemExit(124)
if completed.stdout:
sys.stdout.buffer.write(completed.stdout)
if completed.stderr:
sys.stderr.buffer.write(completed.stderr)
raise SystemExit(completed.returncode)
if stdout:
sys.stdout.buffer.write(stdout)
if stderr:
sys.stderr.buffer.write(stderr)
raise SystemExit(process.returncode)
PY
}
@@ -690,6 +713,16 @@ show_log_excerpt() {
tail -n 80 "$log_path" >&2 || true
}
terminate_process_tree() {
local pid="$1"
local signal_name="${2:-TERM}"
local child
pgrep -P "$pid" 2>/dev/null | while read -r child; do
terminate_process_tree "$child" "$signal_name"
done
kill "-$signal_name" "$pid" >/dev/null 2>&1 || true
}
phase_run() {
local phase_id="$1"
local timeout_s="$2"
@@ -716,9 +749,9 @@ phase_run() {
fi
if (( SECONDS - start >= timeout_s )); then
timed_out=1
kill "$pid" >/dev/null 2>&1 || true
terminate_process_tree "$pid" TERM
sleep 2
kill -9 "$pid" >/dev/null 2>&1 || true
terminate_process_tree "$pid" KILL
break
fi
sleep 1