diff --git a/docs/help/testing.md b/docs/help/testing.md index a87127c9018..d281f1011fa 100644 --- a/docs/help/testing.md +++ b/docs/help/testing.md @@ -134,6 +134,37 @@ runs the same lanes before release approval. `openclaw update --tag `, and verifies the candidate's post-update doctor repairs bundled channel runtime dependencies without a harness-side postinstall repair. +- `pnpm test:parallels:npm-update` + - Runs the native packaged-install update smoke across Parallels guests. Each + selected platform first installs the requested baseline package, then runs + the installed `openclaw update` command in the same guest and verifies the + installed version, update status, gateway readiness, and one local agent + turn. + - Use `--platform macos`, `--platform windows`, or `--platform linux` while + iterating on one guest. Use `--json` for the summary artifact path and + per-lane status. + - Wrap long local runs in a host timeout so Parallels transport stalls cannot + consume the rest of the testing window: + + ```bash + timeout --foreground 150m pnpm test:parallels:npm-update -- --json + timeout --foreground 90m pnpm test:parallels:npm-update -- --platform windows --json + ``` + + - The script writes nested lane logs under `/tmp/openclaw-parallels-npm-update.*`. + Inspect `windows-update.log`, `macos-update.log`, or `linux-update.log` + before assuming the outer wrapper is hung. + - Windows update can spend 10 to 15 minutes in post-update doctor/runtime + dependency repair on a cold guest; that is still healthy when the nested + npm debug log is advancing. + - Do not run this aggregate wrapper in parallel with individual Parallels + macOS, Windows, or Linux smoke lanes. They share VM state and can collide on + snapshot restore, package serving, or guest gateway state. + - The post-update proof runs the normal bundled plugin surface because + capability facades such as speech, image generation, and media + understanding are loaded through bundled runtime APIs even when the agent + turn itself only checks a simple text response. + - `pnpm openclaw qa aimock` - Starts only the local AIMock provider server for direct protocol smoke testing. diff --git a/scripts/e2e/parallels-npm-update-smoke.sh b/scripts/e2e/parallels-npm-update-smoke.sh index 6e1164a71d7..d43b5d03f15 100755 --- a/scripts/e2e/parallels-npm-update-smoke.sh +++ b/scripts/e2e/parallels-npm-update-smoke.sh @@ -380,16 +380,25 @@ source_tree_dirty_for_build() { [[ -n "$(git status --porcelain -- src ui packages extensions package.json pnpm-lock.yaml 'tsconfig*.json' 2>/dev/null)" ]] } +current_build_has_control_ui() { + [[ -f dist/control-ui/index.html ]] || return 1 + compgen -G "dist/control-ui/assets/*" >/dev/null +} + ensure_current_build() { local build_commit head rc head="$(git rev-parse HEAD)" build_commit="$(current_build_commit)" - if [[ "$build_commit" == "$head" ]] && ! source_tree_dirty_for_build; then + if [[ "$build_commit" == "$head" ]] && ! source_tree_dirty_for_build && current_build_has_control_ui; then return 0 fi say "Build dist for current head" pnpm build rc=$? + if [[ $rc -eq 0 ]]; then + pnpm ui:build + rc=$? + fi if [[ $rc -eq 0 ]]; then parallels_package_assert_no_generated_drift rc=$? @@ -525,6 +534,30 @@ function Invoke-CaptureLogged { return ($output | Out-String).Trim() } +function Test-GatewayListenerReady { + $listeners = Get-NetTCPConnection -LocalPort 18789 -State Listen -ErrorAction SilentlyContinue + return [bool]$listeners +} + +function Test-GatewayLogReady { + $logDir = Join-Path $env:LOCALAPPDATA 'Temp\openclaw' + if (-not (Test-Path $logDir)) { + return $false + } + $logFile = Get-ChildItem -Path $logDir -Filter 'openclaw-*.log' -File -ErrorAction SilentlyContinue | + Sort-Object LastWriteTime -Descending | + Select-Object -First 1 + if (-not $logFile) { + return $false + } + try { + $tail = Get-Content -Path $logFile.FullName -Tail 120 -ErrorAction Stop | Out-String + } catch { + return $false + } + return $tail -match '"ready \(' +} + function Wait-GatewayRpcReady { param( [Parameter(Mandatory = $true)][string]$OpenClawPath, @@ -534,11 +567,17 @@ function Wait-GatewayRpcReady { for ($attempt = 1; $attempt -le $Attempts; $attempt++) { Write-ProgressLog "update.gateway-status.attempt-$attempt" + if ((Test-GatewayListenerReady) -and (Test-GatewayLogReady)) { + Write-ProgressLog "update.gateway-status.ready-log-$attempt" + return $true + } try { - $statusOutput = Invoke-CaptureLogged 'openclaw gateway status' { & $OpenClawPath gateway status --deep --require-rpc } - if ($statusOutput -match 'Read probe:\s*failed') { - throw 'gateway status returned without RPC read readiness' + $probeOutput = Invoke-CaptureLogged 'openclaw gateway probe' { & $OpenClawPath gateway probe --url ws://127.0.0.1:18789 --timeout 5000 --json } + $probe = $probeOutput | ConvertFrom-Json + if (-not $probe.ok) { + throw 'gateway probe returned without RPC readiness' } + Invoke-CaptureLogged 'openclaw gateway status' { & $OpenClawPath gateway status --deep --require-rpc } | Out-Null return $true } catch { if ($attempt -ge $Attempts) { @@ -674,12 +713,11 @@ function Invoke-OpenClawUpdateWithTimeout { param( [Parameter(Mandatory = $true)][string]$OpenClawPath, [Parameter(Mandatory = $true)][string]$UpdateTarget, - [int]$TimeoutSeconds = 600 + [int]$TimeoutSeconds = 1200 ) $updateJob = Start-Job -ScriptBlock { param([string]$Path, [string]$Target) - $env:OPENCLAW_DISABLE_BUNDLED_PLUGINS = '1' $output = & $Path update --tag $Target --yes --json *>&1 [pscustomobject]@{ ExitCode = $LASTEXITCODE @@ -707,6 +745,65 @@ function Invoke-OpenClawUpdateWithTimeout { Stop-OpenClawUpdateProcesses } +function Invoke-OpenClawAgentWithTimeout { + param( + [Parameter(Mandatory = $true)][string]$OpenClawPath, + [Parameter(Mandatory = $true)][string]$SessionId, + [int]$TimeoutSeconds = 600 + ) + + $message = 'Reply with exact ASCII text OK only.' + $stdout = Join-Path $env:TEMP ("openclaw-parallels-agent-{0}.out.log" -f ([guid]::NewGuid().ToString('N'))) + $stderr = Join-Path $env:TEMP ("openclaw-parallels-agent-{0}.err.log" -f ([guid]::NewGuid().ToString('N'))) + $agentJob = Start-Job -ScriptBlock { + param([string]$Path, [string]$AgentSessionId, [string]$AgentMessage, [string]$StdoutPath, [string]$StderrPath) + & $Path agent --local --agent main --session-id $AgentSessionId --message $AgentMessage --json > $StdoutPath 2> $StderrPath + exit $LASTEXITCODE + } -ArgumentList $OpenClawPath, $SessionId, $message, $stdout, $stderr + $deadline = (Get-Date).AddSeconds($TimeoutSeconds) + $combined = '' + while ((Get-Date) -lt $deadline) { + Start-Sleep -Seconds 2 + $out = '' + $err = '' + if (Test-Path $stdout) { + $out = Get-Content -Path $stdout -Raw -ErrorAction SilentlyContinue + } + if (Test-Path $stderr) { + $err = Get-Content -Path $stderr -Raw -ErrorAction SilentlyContinue + } + $combined = "$out`n$err" + if ($combined -match '"finalAssistantRawText":\s*"OK"' -or $combined -match '"finalAssistantVisibleText":\s*"OK"') { + if ($combined.Trim().Length -gt 0) { + $combined.Trim() | Tee-Object -FilePath $LogPath -Append | Out-Null + } + Stop-Job $agentJob -ErrorAction SilentlyContinue + Remove-Job $agentJob -Force -ErrorAction SilentlyContinue + return 0 + } + if ($agentJob.State -in @('Completed', 'Failed', 'Stopped')) { + if ($combined.Trim().Length -gt 0) { + $combined.Trim() | Tee-Object -FilePath $LogPath -Append | Out-Null + } + Receive-Job $agentJob -ErrorAction SilentlyContinue | Out-Null + $jobState = $agentJob.State + Remove-Job $agentJob -Force -ErrorAction SilentlyContinue + if ($jobState -ne 'Completed') { + throw "openclaw agent failed with job state $jobState" + } + throw 'openclaw agent finished without OK response' + } + } + + Stop-Job $agentJob -ErrorAction SilentlyContinue + Remove-Job $agentJob -Force -ErrorAction SilentlyContinue + Write-ProgressLog 'update.agent-turn.timeout' + if ($combined.Trim().Length -gt 0) { + $combined.Trim() | Tee-Object -FilePath $LogPath -Append | Out-Null + } + throw "openclaw agent timed out after ${TimeoutSeconds}s" +} + function Start-GatewayRunFallback { param( [Parameter(Mandatory = $true)][string]$OpenClawPath @@ -836,13 +933,10 @@ try { # an explicit start only if the RPC endpoint never returns. Write-ProgressLog 'update.restart-gateway' Restart-GatewayWithRecovery -OpenClawPath $openclaw + Stop-OpenClawGatewayProcesses Complete-WorkspaceSetup Write-ProgressLog 'update.agent-turn' - Invoke-CaptureLogged 'openclaw agent' { & $openclaw agent --local --agent main --session-id $SessionId --message 'Reply with exact ASCII text OK only.' --json } | Out-Null - $exitCode = $LASTEXITCODE - if ($null -eq $exitCode) { - $exitCode = 0 - } + $exitCode = Invoke-OpenClawAgentWithTimeout -OpenClawPath $openclaw -SessionId $SessionId Write-ProgressLog 'update.done' Set-Content -Path $DonePath -Value ([string]$exitCode) exit $exitCode @@ -921,7 +1015,19 @@ verify_macos_update_after_transport_loss() { cat </dev/null set -euo pipefail export PATH=/opt/homebrew/bin:/opt/homebrew/opt/node/bin:/opt/homebrew/sbin:/usr/bin:/bin:/usr/sbin:/sbin +export OPENCLAW_PLUGIN_STAGE_DIR="\$HOME/.openclaw/plugin-runtime-deps-parallels" busy="\$(/bin/ps -axo command | /usr/bin/egrep 'openclaw update|npm install|pnpm install|pnpm run build' | /usr/bin/egrep -v 'egrep|openclaw-npm-update-macos-recover' || true)" +gateway_listener_ready() { + /usr/sbin/lsof -tiTCP:18789 -sTCP:LISTEN >/dev/null 2>&1 +} +gateway_log_ready() { + latest="\$(/bin/ls -t /tmp/openclaw/openclaw-*.log 2>/dev/null | /usr/bin/head -n 1 || true)" + [ -n "\$latest" ] || return 1 + /usr/bin/tail -n 160 "\$latest" | /usr/bin/grep -q 'ready (' +} +gateway_smoke_ready() { + gateway_listener_ready && gateway_log_ready +} if [ -n "\$busy" ]; then printf 'update still has active npm/pnpm/openclaw processes\n%s\n' "\$busy" >&2 exit 1 @@ -937,10 +1043,10 @@ if [ -n "$expected_needle" ]; then ;; esac fi -/opt/homebrew/bin/openclaw gateway status --deep --require-rpc >/dev/null 2>&1 || /opt/homebrew/bin/openclaw gateway restart || true +gateway_smoke_ready || /opt/homebrew/bin/openclaw gateway restart || true gateway_ready=0 for _ in 1 2 3 4 5 6; do - if /opt/homebrew/bin/openclaw gateway status --deep --require-rpc; then + if gateway_smoke_ready; then gateway_ready=1 break fi @@ -949,7 +1055,7 @@ done if [ "\$gateway_ready" != "1" ]; then /opt/homebrew/bin/openclaw gateway start || true for _ in 1 2 3 4 5 6; do - if /opt/homebrew/bin/openclaw gateway status --deep --require-rpc; then + if gateway_smoke_ready; then gateway_ready=1 break fi @@ -957,7 +1063,7 @@ if [ "\$gateway_ready" != "1" ]; then done fi if [ "\$gateway_ready" != "1" ]; then - echo "gateway did not become RPC-ready after transport recovery" >&2 + echo "gateway did not become ready after transport recovery" >&2 exit 1 fi workspace="\${OPENCLAW_WORKSPACE_DIR:-\$HOME/.openclaw/workspace}" @@ -993,7 +1099,7 @@ print(base64.b64encode(os.environ["PROVIDER_KEY"].encode("utf-8")).decode("ascii PY )" set +e - guest_powershell_poll 120 "$(cat < \$StdoutPath 2> \$StderrPath + exit \$LASTEXITCODE +} -ArgumentList \$openclaw, \$agentStdout, \$agentStderr +\$agentDeadline = (Get-Date).AddSeconds(600) +\$agentCombined = '' +while ((Get-Date) -lt \$agentDeadline) { + Start-Sleep -Seconds 2 + \$agentOut = '' + \$agentErr = '' + if (Test-Path \$agentStdout) { + \$agentOut = Get-Content -Path \$agentStdout -Raw -ErrorAction SilentlyContinue + } + if (Test-Path \$agentStderr) { + \$agentErr = Get-Content -Path \$agentStderr -Raw -ErrorAction SilentlyContinue + } + \$agentCombined = \$agentOut + [Environment]::NewLine + \$agentErr + if (\$agentCombined -match '"finalAssistantRawText":\s*"OK"' -or \$agentCombined -match '"finalAssistantVisibleText":\s*"OK"') { + if (\$agentCombined.Trim().Length -gt 0) { + \$agentCombined.Trim() | Write-Output + } + Stop-Job \$agentJob -ErrorAction SilentlyContinue + Remove-Job \$agentJob -Force -ErrorAction SilentlyContinue + \$agentJob = \$null + break + } + if (\$agentJob.State -in @('Completed', 'Failed', 'Stopped')) { + if (\$agentCombined.Trim().Length -gt 0) { + \$agentCombined.Trim() | Write-Output + } + Receive-Job \$agentJob -ErrorAction SilentlyContinue | Out-Null + \$agentJobState = \$agentJob.State + Remove-Job \$agentJob -Force -ErrorAction SilentlyContinue + \$agentJob = \$null + if (\$agentJobState -ne 'Completed') { + throw "openclaw agent failed with job state \$agentJobState" + } + throw 'openclaw agent finished without OK response' + break + } +} +if (\$null -ne \$agentJob) { + Stop-Job \$agentJob -ErrorAction SilentlyContinue + Remove-Job \$agentJob -Force -ErrorAction SilentlyContinue + if (\$agentCombined.Trim().Length -gt 0) { + \$agentCombined.Trim() | Write-Output + } + throw 'openclaw agent timed out after 600s' +} EOF )" local rc=$? @@ -1146,14 +1304,24 @@ start_timeout_guard() { if [[ -n "$log_path" ]]; then dump_log_tail "$label" "$log_path" fi - kill "$pid" >/dev/null 2>&1 || true + terminate_process_tree "$pid" TERM sleep 2 - kill -9 "$pid" >/dev/null 2>&1 || true + terminate_process_tree "$pid" KILL fi ) >&2 & printf '%s\n' "$!" } +terminate_process_tree() { + local pid="$1" + local signal_name="${2:-TERM}" + local child + pgrep -P "$pid" 2>/dev/null | while read -r child; do + terminate_process_tree "$child" "$signal_name" + done + kill "-$signal_name" "$pid" >/dev/null 2>&1 || true +} + stop_timeout_guard() { local pid="${1:-}" [[ -n "$pid" ]] || return 0 @@ -1210,27 +1378,50 @@ host_timeout_exec() { shift HOST_TIMEOUT_S="$timeout_s" "$PYTHON_BIN" - "$@" <<'PY' import os +import signal import subprocess import sys timeout = int(os.environ["HOST_TIMEOUT_S"]) args = sys.argv[1:] +process = subprocess.Popen( + args, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + start_new_session=True, +) try: - completed = subprocess.run(args, stdout=subprocess.PIPE, stderr=subprocess.PIPE, timeout=timeout) -except subprocess.TimeoutExpired as exc: - if exc.stdout: - sys.stdout.buffer.write(exc.stdout) - if exc.stderr: - sys.stderr.buffer.write(exc.stderr) + stdout, stderr = process.communicate(timeout=timeout) +except subprocess.TimeoutExpired: + try: + os.killpg(process.pid, signal.SIGTERM) + except ProcessLookupError: + pass + except PermissionError: + pass + try: + stdout, stderr = process.communicate(timeout=2) + except subprocess.TimeoutExpired: + try: + os.killpg(process.pid, signal.SIGKILL) + except ProcessLookupError: + pass + except PermissionError: + pass + stdout, stderr = process.communicate() + if stdout: + sys.stdout.buffer.write(stdout) + if stderr: + sys.stderr.buffer.write(stderr) sys.stderr.write(f"host timeout after {timeout}s\n") raise SystemExit(124) -if completed.stdout: - sys.stdout.buffer.write(completed.stdout) -if completed.stderr: - sys.stderr.buffer.write(completed.stderr) -raise SystemExit(completed.returncode) +if stdout: + sys.stdout.buffer.write(stdout) +if stderr: + sys.stderr.buffer.write(stderr) +raise SystemExit(process.returncode) PY } @@ -1401,11 +1592,23 @@ run_macos_update() { set -euo pipefail export PATH=/opt/homebrew/bin:/opt/homebrew/opt/node/bin:/opt/homebrew/sbin:/usr/bin:/bin:/usr/sbin:/sbin if [ -z "\${HOME:-}" ]; then export HOME="/Users/\$(id -un)"; fi +export OPENCLAW_PLUGIN_STAGE_DIR="\$HOME/.openclaw/plugin-runtime-deps-parallels" if [ -z "\${$API_KEY_ENV:-}" ]; then echo "$API_KEY_ENV is required in the macOS update environment" >&2 exit 1 fi cd "\$HOME" +gateway_listener_ready() { + /usr/sbin/lsof -tiTCP:18789 -sTCP:LISTEN >/dev/null 2>&1 +} +gateway_log_ready() { + latest="\$(/bin/ls -t /tmp/openclaw/openclaw-*.log 2>/dev/null | /usr/bin/head -n 1 || true)" + [ -n "\$latest" ] || return 1 + /usr/bin/tail -n 160 "\$latest" | /usr/bin/grep -q 'ready (' +} +gateway_smoke_ready() { + gateway_listener_ready && gateway_log_ready +} scrub_future_plugin_entries() { node - <<'JS' || true const fs = require("fs"); @@ -1446,7 +1649,7 @@ stop_openclaw_gateway_processes() { # host can observe new plugin metadata mid-update and abort config validation. scrub_future_plugin_entries stop_openclaw_gateway_processes -OPENCLAW_DISABLE_BUNDLED_PLUGINS=1 /opt/homebrew/bin/openclaw update --tag "$update_target" --yes --json +/opt/homebrew/bin/openclaw update --tag "$update_target" --yes --json # Same-guest npm upgrades can leave the old gateway process holding the old # bundled plugin host version. Stop it before post-update config commands. stop_openclaw_gateway_processes @@ -1470,7 +1673,7 @@ fi /opt/homebrew/bin/openclaw gateway restart || true gateway_ready=0 for _ in 1 2 3 4 5 6 7 8; do - if /opt/homebrew/bin/openclaw gateway status --deep --require-rpc >/dev/null 2>&1; then + if gateway_smoke_ready; then gateway_ready=1 break fi @@ -1480,7 +1683,7 @@ if [ "\$gateway_ready" != "1" ]; then stop_openclaw_gateway_processes /opt/homebrew/bin/openclaw gateway run --bind loopback --port 18789 --force >/tmp/openclaw-parallels-npm-update-macos-gateway.log 2>&1 /dev/null 2>&1; then + if gateway_smoke_ready; then gateway_ready=1 break fi @@ -1490,7 +1693,9 @@ fi if [ "\$gateway_ready" != "1" ]; then tail -n 120 /tmp/openclaw-parallels-npm-update-macos-gateway.log 2>/dev/null || true fi -/opt/homebrew/bin/openclaw gateway status --deep --require-rpc +if [ "\$gateway_ready" != "1" ]; then + /opt/homebrew/bin/openclaw gateway status --deep --require-rpc +fi workspace="\${OPENCLAW_WORKSPACE_DIR:-\$HOME/.openclaw/workspace}" mkdir -p "\$workspace/.openclaw" cat > "\$workspace/IDENTITY.md" <<'IDENTITY_EOF' @@ -1577,7 +1782,7 @@ stop_openclaw_gateway_processes() { # the old host can observe new plugin metadata mid-update and abort validation. scrub_future_plugin_entries stop_openclaw_gateway_processes -OPENCLAW_DISABLE_BUNDLED_PLUGINS=1 openclaw update --tag "$update_target" --yes --json +openclaw update --tag "$update_target" --yes --json # The fresh Linux lane starts a manual gateway; stop the old process before # post-update config validation sees mixed old-host/new-plugin metadata. stop_openclaw_gateway_processes diff --git a/scripts/e2e/parallels-windows-smoke.sh b/scripts/e2e/parallels-windows-smoke.sh index 6d1efb945fe..607445b7bd6 100644 --- a/scripts/e2e/parallels-windows-smoke.sh +++ b/scripts/e2e/parallels-windows-smoke.sh @@ -413,27 +413,50 @@ host_timeout_exec() { shift HOST_TIMEOUT_S="$timeout_s" python3 - "$@" <<'PY' import os +import signal import subprocess import sys timeout = int(os.environ["HOST_TIMEOUT_S"]) args = sys.argv[1:] +process = subprocess.Popen( + args, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + start_new_session=True, +) try: - completed = subprocess.run(args, stdout=subprocess.PIPE, stderr=subprocess.PIPE, timeout=timeout) -except subprocess.TimeoutExpired as exc: - if exc.stdout: - sys.stdout.buffer.write(exc.stdout) - if exc.stderr: - sys.stderr.buffer.write(exc.stderr) + stdout, stderr = process.communicate(timeout=timeout) +except subprocess.TimeoutExpired: + try: + os.killpg(process.pid, signal.SIGTERM) + except ProcessLookupError: + pass + except PermissionError: + pass + try: + stdout, stderr = process.communicate(timeout=2) + except subprocess.TimeoutExpired: + try: + os.killpg(process.pid, signal.SIGKILL) + except ProcessLookupError: + pass + except PermissionError: + pass + stdout, stderr = process.communicate() + if stdout: + sys.stdout.buffer.write(stdout) + if stderr: + sys.stderr.buffer.write(stderr) sys.stderr.write(f"host timeout after {timeout}s\n") raise SystemExit(124) -if completed.stdout: - sys.stdout.buffer.write(completed.stdout) -if completed.stderr: - sys.stderr.buffer.write(completed.stderr) -raise SystemExit(completed.returncode) +if stdout: + sys.stdout.buffer.write(stdout) +if stderr: + sys.stderr.buffer.write(stderr) +raise SystemExit(process.returncode) PY } @@ -690,6 +713,16 @@ show_log_excerpt() { tail -n 80 "$log_path" >&2 || true } +terminate_process_tree() { + local pid="$1" + local signal_name="${2:-TERM}" + local child + pgrep -P "$pid" 2>/dev/null | while read -r child; do + terminate_process_tree "$child" "$signal_name" + done + kill "-$signal_name" "$pid" >/dev/null 2>&1 || true +} + phase_run() { local phase_id="$1" local timeout_s="$2" @@ -716,9 +749,9 @@ phase_run() { fi if (( SECONDS - start >= timeout_s )); then timed_out=1 - kill "$pid" >/dev/null 2>&1 || true + terminate_process_tree "$pid" TERM sleep 2 - kill -9 "$pid" >/dev/null 2>&1 || true + terminate_process_tree "$pid" KILL break fi sleep 1