test: improve parallels smoke diagnostics

This commit is contained in:
Peter Steinberger
2026-04-06 14:41:19 +01:00
parent 50082f91ff
commit 53f86745e1
3 changed files with 103 additions and 9 deletions

View File

@@ -17,6 +17,7 @@ Use this skill for Parallels guest workflows and smoke interpretation. Do not lo
- Per-phase logs land under `/tmp/openclaw-parallels-*`.
- Do not run local and gateway agent turns in parallel on the same fresh workspace or session.
- If `main` is moving under active multi-agent work, prefer a detached worktree pinned to one commit for long Parallels suites. The smoke scripts now verify the packed tgz commit instead of live `git rev-parse HEAD`, but a pinned worktree still avoids noisy rebuild/version drift during reruns.
- For `openclaw update --channel dev` lanes, remember the guest clones GitHub `main`, not your local worktree. If a local fix exists but the rerun still fails inside the cloned dev checkout, do not treat that as disproof of the fix until the branch has been pushed.
- For `prlctl exec`, pass the VM name before `--current-user` (`prlctl exec "$VM" --current-user ...`), not the other way around.
- If the workflow installs OpenClaw from a repo checkout instead of the site installer/npm release, finish by installing a real guest CLI shim and verifying it in a fresh guest shell. `pnpm openclaw ...` inside the repo is not enough for handoff parity.
- On macOS guests, prefer a user-global install plus a stable PATH-visible shim:
@@ -79,7 +80,9 @@ Use this skill for Parallels guest workflows and smoke interpretation. Do not lo
- Windows installer/tgz phases now retry once after guest-ready recheck; keep new Windows smoke steps idempotent so a transport-flake retry is safe.
- If a Windows retry sees the VM become `suspended` or `stopped`, resume/start it before the next `prlctl exec`; otherwise the second attempt just repeats the same `rc=255`.
- Windows global `npm install -g` phases can stay quiet for a minute or more even when healthy; inspect the phase log before calling it hung, and only treat it as a regression once the retry wrapper or timeout trips.
- When those Windows global installs stay quiet, the useful progress often lives in the guest npm debug log, not the helper phase log. The smoke script now streams incremental `npm-cache/_logs/*-debug-0.log` deltas into the phase log during long baseline/package installs; read those lines before assuming the lane is stalled.
- The Windows baseline-package helpers now auto-dump the latest guest `npm-cache/_logs/*-debug-0.log` tail on timeout or nonzero completion. Read that tail in the phase log before opening a second guest shell.
- The same incremental npm-debug streaming also applies to `--upgrade-from-packed-main` / packaged-install baseline phases. A phase log that still says only `install.start`, `install.download-tgz`, `install.install-tgz` can still be healthy if the streamed npm-debug section shows registry fetches or bundled-plugin postinstall work.
- Fresh Windows tgz install phases should also use the background PowerShell runner plus done-file/log-drain pattern; do not rely on one long-lived `prlctl exec ... powershell ... npm install -g` transport for package installs.
- Windows release-to-dev helpers should log `where pnpm` before and after the update and require `where pnpm` to succeed post-update. That proves the updater installed or enabled `pnpm` itself instead of depending on a smoke-only bootstrap.
- Fresh Windows ref-mode onboard should use the same background PowerShell runner plus done-file/log-drain pattern as the npm-update helper, including startup materialization checks, host-side timeouts on short poll `prlctl exec` calls, and retry-on-poll-failure behavior for transient transport flakes.

View File

@@ -41,6 +41,7 @@ RUN_DIR="$(mktemp -d /tmp/openclaw-parallels-smoke.XXXXXX)"
BUILD_LOCK_DIR="${TMPDIR:-/tmp}/openclaw-parallels-build.lock"
TIMEOUT_INSTALL_S=900
TIMEOUT_UPDATE_DEV_S=1500
TIMEOUT_VERIFY_S=60
TIMEOUT_ONBOARD_S=180
TIMEOUT_GATEWAY_S=60
@@ -708,13 +709,24 @@ run_dev_channel_update() {
update_entry="$update_root/openclaw.mjs"
ensure_guest_pnpm_for_dev_update
printf 'update-dev: run\n'
guest_current_user_exec /bin/rm -rf "$update_root"
guest_current_user_exec_path "$bootstrap_bin:$GUEST_EXEC_PATH" \
"$GUEST_NODE_BIN" "$GUEST_OPENCLAW_ENTRY" update --channel dev --yes --json
guest_current_user_sh "$(cat <<EOF
rm -rf $(shell_quote "$update_root")
export PATH=$(shell_quote "$bootstrap_bin:$GUEST_EXEC_PATH")
$GUEST_NODE_BIN $GUEST_OPENCLAW_ENTRY update --channel dev --yes --json
EOF
)"
printf 'update-dev: git-version\n'
guest_current_user_node_cli "$update_entry" --version
guest_current_user_sh "$(cat <<EOF
export PATH=$(shell_quote "$GUEST_EXEC_PATH")
$GUEST_NODE_BIN $(shell_quote "$update_entry") --version
EOF
)"
printf 'update-dev: git-status\n'
guest_current_user_node_cli "$update_entry" update status --json
guest_current_user_sh "$(cat <<EOF
export PATH=$(shell_quote "$GUEST_EXEC_PATH")
$GUEST_NODE_BIN $(shell_quote "$update_entry") update status --json
EOF
)"
}
verify_dev_channel_update() {
@@ -1358,7 +1370,7 @@ run_upgrade_lane() {
phase_run "upgrade.verify-main-version" "$TIMEOUT_VERIFY_S" verify_target_version
phase_run "upgrade.verify-bundle-permissions" "$TIMEOUT_PERMISSION_S" verify_bundle_permissions
else
phase_run "upgrade.update-dev" "$TIMEOUT_INSTALL_S" run_dev_channel_update
phase_run "upgrade.update-dev" "$TIMEOUT_UPDATE_DEV_S" run_dev_channel_update
UPGRADE_MAIN_VERSION="$(extract_last_version "$(phase_log_path upgrade.update-dev)")"
phase_run "upgrade.verify-dev-channel" "$TIMEOUT_VERIFY_S" verify_dev_channel_update
fi

View File

@@ -494,6 +494,57 @@ EOF
printf '%s\n' "$npm_log"
}
#######################################
# Print only the not-yet-seen lines of the newest guest npm debug log.
#
# Each call fetches a snapshot (marker line, log path, last 80 log lines) from
# the guest and compares it with the snapshot stored by the previous call.
# Because the snapshot is a sliding `-Tail 80` window, the comparison looks for
# the longest run of lines that ends the previous window and starts the current
# one; a plain prefix check would re-dump the whole window as soon as the log
# grows past 80 lines.
#
# Arguments:
#   $1 - label printed as "==> <label>" above any new lines
#   $2 - host-side state file holding the previous snapshot (rewritten on
#        every successful poll; a missing file is treated as empty)
# Outputs:
#   New log lines on stdout, newline-terminated; nothing when unchanged.
# Returns:
#   0 when the guest had nothing to report or the delta was processed,
#   otherwise the exit status of guest_powershell_poll or python3.
# Notes:
#   guest_powershell_poll is defined elsewhere in this file; the leading 20 is
#   presumably its timeout in seconds - confirm against that helper.
#   errexit is switched off around the poll and unconditionally re-enabled.
#######################################
stream_latest_guest_npm_log_tail_delta() {
  local label="$1"
  local state_path="$2"
  local npm_log rc
  set +e
  npm_log="$(
    guest_powershell_poll 20 "$(
      cat <<'EOF'
$logDir = Join-Path $env:LOCALAPPDATA 'npm-cache\_logs'
if (-not (Test-Path $logDir)) {
  exit 0
}
$latest = Get-ChildItem $logDir -Filter '*-debug-0.log' |
  Sort-Object LastWriteTime -Descending |
  Select-Object -First 1
if ($null -eq $latest) {
  exit 0
}
"==> npm-debug-log"
$latest.FullName
Get-Content $latest.FullName -Tail 80
EOF
    )"
  )"
  rc=$?
  set -e
  # Nothing to diff: either the poll failed (propagate its status) or the
  # guest has no npm debug log yet (rc is 0).
  if [[ $rc -ne 0 || -z "$npm_log" ]]; then
    return "$rc"
  fi
  GUEST_LOG="$npm_log" python3 - "$state_path" "$label" <<'PY'
import os
import pathlib
import sys

state_path = pathlib.Path(sys.argv[1])
label = sys.argv[2]

try:
    previous = state_path.read_text(encoding="utf-8", errors="replace")
except FileNotFoundError:
    previous = ""

# Environment values are decoded with surrogateescape; round-trip through
# bytes so undecodable guest output cannot break the UTF-8 write below.
raw = os.fsencode(os.environ["GUEST_LOG"]).decode("utf-8", errors="replace")
current = raw.replace("\r\n", "\n").replace("\r", "\n")

# The guest script emits two header lines (marker + log path) ahead of the
# log tail itself.
header_len = 2
previous_lines = previous.split("\n") if previous else []
current_lines = current.split("\n")

if previous_lines[:header_len] == current_lines[:header_len]:
    # Same log file: skip the lines already shown by locating the longest
    # suffix of the old window that is a prefix of the new window.
    old_body = previous_lines[header_len:]
    new_body = current_lines[header_len:]
    overlap = 0
    for size in range(min(len(old_body), len(new_body)), 0, -1):
        if old_body[-size:] == new_body[:size]:
            overlap = size
            break
    delta_lines = new_body[overlap:]
else:
    # First poll or a different log file: show the whole snapshot.
    delta_lines = current_lines

if delta_lines:
    sys.stdout.write(f"==> {label}\n")
    sys.stdout.write("\n".join(delta_lines) + "\n")
state_path.write_text(current, encoding="utf-8")
PY
}
guest_run_openclaw() {
local env_name="${1:-}"
local env_value="${2:-}"
@@ -1069,8 +1120,8 @@ install_baseline_npm_release() {
local version="$2"
local script_url
local runner_name log_name done_name done_status launcher_state guest_log
local log_state_path
local start_seconds poll_deadline startup_checked poll_rc state_rc log_rc
local log_state_path npm_log_state_path
local start_seconds poll_deadline startup_checked poll_rc state_rc log_rc last_npm_log_poll
write_baseline_npm_install_runner_script
script_url="http://$host_ip:$HOST_PORT/$(basename "$WINDOWS_BASELINE_INSTALL_SCRIPT_PATH")"
@@ -1078,10 +1129,13 @@ install_baseline_npm_release() {
log_name="openclaw-install-baseline-$RANDOM-$RANDOM.log"
done_name="openclaw-install-baseline-$RANDOM-$RANDOM.done"
log_state_path="$(mktemp "${TMPDIR:-/tmp}/openclaw-install-baseline-log-state.XXXXXX")"
npm_log_state_path="$(mktemp "${TMPDIR:-/tmp}/openclaw-install-baseline-npm-log-state.XXXXXX")"
: >"$log_state_path"
: >"$npm_log_state_path"
start_seconds="$SECONDS"
poll_deadline=$((SECONDS + TIMEOUT_INSTALL_S + 60))
startup_checked=0
last_npm_log_poll=0
guest_powershell_poll 20 "$(cat <<EOF
\$runner = Join-Path \$env:TEMP '$runner_name'
@@ -1161,6 +1215,7 @@ PY
dump_latest_guest_npm_log_tail "windows baseline install npm debug tail" || true
fi
rm -f "$log_state_path"
rm -f "$npm_log_state_path"
[[ "$done_status" == "0" ]]
return $?
fi
@@ -1176,9 +1231,18 @@ PY
if [[ $state_rc -eq 0 && "$launcher_state" == *"runner=False"* && "$launcher_state" == *"log=False"* && "$launcher_state" == *"done=False"* ]]; then
warn "windows baseline install helper failed to materialize guest files"
rm -f "$log_state_path"
rm -f "$npm_log_state_path"
return 1
fi
fi
if (( SECONDS - start_seconds >= 45 && SECONDS - last_npm_log_poll >= 30 )); then
if ! stream_latest_guest_npm_log_tail_delta \
"windows baseline install npm debug progress" \
"$npm_log_state_path"; then
:
fi
last_npm_log_poll=$SECONDS
fi
if (( SECONDS >= poll_deadline )); then
if ! stream_windows_baseline_install_log; then
warn "windows baseline install helper log drain failed after timeout"
@@ -1186,6 +1250,7 @@ PY
dump_latest_guest_npm_log_tail "windows baseline install npm debug tail" || true
warn "windows baseline install helper timed out waiting for done file"
rm -f "$log_state_path"
rm -f "$npm_log_state_path"
return 1
fi
sleep 2
@@ -1324,7 +1389,7 @@ install_main_tgz() {
local tgz_url script_url
local runner_name log_name done_name done_status launcher_state guest_log
local start_seconds poll_deadline startup_checked poll_rc state_rc log_rc
local log_state_path
local log_state_path npm_log_state_path last_npm_log_poll
tgz_url="http://$host_ip:$HOST_PORT/$(basename "$MAIN_TGZ_PATH")"
write_install_runner_script
script_url="http://$host_ip:$HOST_PORT/$(basename "$WINDOWS_INSTALL_SCRIPT_PATH")"
@@ -1332,10 +1397,13 @@ install_main_tgz() {
log_name="openclaw-install-$RANDOM-$RANDOM.log"
done_name="openclaw-install-$RANDOM-$RANDOM.done"
log_state_path="$(mktemp "${TMPDIR:-/tmp}/openclaw-install-log-state.XXXXXX")"
npm_log_state_path="$(mktemp "${TMPDIR:-/tmp}/openclaw-install-npm-log-state.XXXXXX")"
: >"$log_state_path"
: >"$npm_log_state_path"
start_seconds="$SECONDS"
poll_deadline=$((SECONDS + TIMEOUT_INSTALL_S + 60))
startup_checked=0
last_npm_log_poll=0
guest_powershell_poll 20 "$(cat <<EOF
\$runner = Join-Path \$env:TEMP '$runner_name'
@@ -1416,6 +1484,7 @@ PY
dump_latest_guest_npm_log_tail "windows packaged install npm debug tail" || true
fi
rm -f "$log_state_path"
rm -f "$npm_log_state_path"
[[ "$done_status" == "0" ]]
return $?
fi
@@ -1431,9 +1500,18 @@ PY
if [[ $state_rc -eq 0 && "$launcher_state" == *"runner=False"* && "$launcher_state" == *"log=False"* && "$launcher_state" == *"done=False"* ]]; then
warn "windows install helper failed to materialize guest files"
rm -f "$log_state_path"
rm -f "$npm_log_state_path"
return 1
fi
fi
if (( SECONDS - start_seconds >= 45 && SECONDS - last_npm_log_poll >= 30 )); then
if ! stream_latest_guest_npm_log_tail_delta \
"windows packaged install npm debug progress" \
"$npm_log_state_path"; then
:
fi
last_npm_log_poll=$SECONDS
fi
if (( SECONDS >= poll_deadline )); then
if ! stream_windows_install_log; then
warn "windows install helper log drain failed after timeout"
@@ -1441,6 +1519,7 @@ PY
dump_latest_guest_npm_log_tail "windows packaged install npm debug tail" || true
warn "windows install helper timed out waiting for done file"
rm -f "$log_state_path"
rm -f "$npm_log_state_path"
return 1
fi
sleep 2