From 53f86745e1a2c2accfa06dd0b987e6153b3e97d4 Mon Sep 17 00:00:00 2001 From: Peter Steinberger Date: Mon, 6 Apr 2026 14:41:19 +0100 Subject: [PATCH] test: improve parallels smoke diagnostics --- .../skills/openclaw-parallels-smoke/SKILL.md | 3 + scripts/e2e/parallels-macos-smoke.sh | 24 ++++-- scripts/e2e/parallels-windows-smoke.sh | 85 ++++++++++++++++++- 3 files changed, 103 insertions(+), 9 deletions(-) diff --git a/.agents/skills/openclaw-parallels-smoke/SKILL.md b/.agents/skills/openclaw-parallels-smoke/SKILL.md index 59012f95560..c6b5580bf35 100644 --- a/.agents/skills/openclaw-parallels-smoke/SKILL.md +++ b/.agents/skills/openclaw-parallels-smoke/SKILL.md @@ -17,6 +17,7 @@ Use this skill for Parallels guest workflows and smoke interpretation. Do not lo - Per-phase logs land under `/tmp/openclaw-parallels-*`. - Do not run local and gateway agent turns in parallel on the same fresh workspace or session. - If `main` is moving under active multi-agent work, prefer a detached worktree pinned to one commit for long Parallels suites. The smoke scripts now verify the packed tgz commit instead of live `git rev-parse HEAD`, but a pinned worktree still avoids noisy rebuild/version drift during reruns. +- For `openclaw update --channel dev` lanes, remember the guest clones GitHub `main`, not your local worktree. If a local fix exists but the rerun still fails inside the cloned dev checkout, do not treat that as disproof of the fix until the branch has been pushed. - For `prlctl exec`, pass the VM name before `--current-user` (`prlctl exec "$VM" --current-user ...`), not the other way around. - If the workflow installs OpenClaw from a repo checkout instead of the site installer/npm release, finish by installing a real guest CLI shim and verifying it in a fresh guest shell. `pnpm openclaw ...` inside the repo is not enough for handoff parity. - On macOS guests, prefer a user-global install plus a stable PATH-visible shim: @@ -79,7 +80,9 @@ Use this skill for Parallels guest workflows and smoke interpretation. Do not lo - Windows installer/tgz phases now retry once after guest-ready recheck; keep new Windows smoke steps idempotent so a transport-flake retry is safe. - If a Windows retry sees the VM become `suspended` or `stopped`, resume/start it before the next `prlctl exec`; otherwise the second attempt just repeats the same `rc=255`. - Windows global `npm install -g` phases can stay quiet for a minute or more even when healthy; inspect the phase log before calling it hung, and only treat it as a regression once the retry wrapper or timeout trips. +- When those Windows global installs stay quiet, the useful progress often lives in the guest npm debug log, not the helper phase log. The smoke script now streams incremental `npm-cache/_logs/*-debug-0.log` deltas into the phase log during long baseline/package installs; read those lines before assuming the lane is stalled. - The Windows baseline-package helpers now auto-dump the latest guest `npm-cache/_logs/*-debug-0.log` tail on timeout or nonzero completion. Read that tail in the phase log before opening a second guest shell. +- The same incremental npm-debug streaming also applies to `--upgrade-from-packed-main` / packaged-install baseline phases. A phase log that still says only `install.start`, `install.download-tgz`, `install.install-tgz` can still be healthy if the streamed npm-debug section shows registry fetches or bundled-plugin postinstall work. - Fresh Windows tgz install phases should also use the background PowerShell runner plus done-file/log-drain pattern; do not rely on one long-lived `prlctl exec ... powershell ... npm install -g` transport for package installs. - Windows release-to-dev helpers should log `where pnpm` before and after the update and require `where pnpm` to succeed post-update. That proves the updater installed or enabled `pnpm` itself instead of depending on a smoke-only bootstrap. - Fresh Windows ref-mode onboard should use the same background PowerShell runner plus done-file/log-drain pattern as the npm-update helper, including startup materialization checks, host-side timeouts on short poll `prlctl exec` calls, and retry-on-poll-failure behavior for transient transport flakes. diff --git a/scripts/e2e/parallels-macos-smoke.sh b/scripts/e2e/parallels-macos-smoke.sh index 25049054136..b3416e08f65 100644 --- a/scripts/e2e/parallels-macos-smoke.sh +++ b/scripts/e2e/parallels-macos-smoke.sh @@ -41,6 +41,7 @@ RUN_DIR="$(mktemp -d /tmp/openclaw-parallels-smoke.XXXXXX)" BUILD_LOCK_DIR="${TMPDIR:-/tmp}/openclaw-parallels-build.lock" TIMEOUT_INSTALL_S=900 +TIMEOUT_UPDATE_DEV_S=1500 TIMEOUT_VERIFY_S=60 TIMEOUT_ONBOARD_S=180 TIMEOUT_GATEWAY_S=60 @@ -708,13 +709,24 @@ run_dev_channel_update() { update_entry="$update_root/openclaw.mjs" ensure_guest_pnpm_for_dev_update printf 'update-dev: run\n' - guest_current_user_exec /bin/rm -rf "$update_root" - guest_current_user_exec_path "$bootstrap_bin:$GUEST_EXEC_PATH" \ - "$GUEST_NODE_BIN" "$GUEST_OPENCLAW_ENTRY" update --channel dev --yes --json + guest_current_user_sh "$(cat < npm-debug-log" +$latest.FullName +Get-Content $latest.FullName -Tail 80 +EOF +)" + )" + rc=$? + set -e + if [[ $rc -ne 0 || -z "$npm_log" ]]; then + return "$rc" + fi + GUEST_LOG="$npm_log" python3 - "$state_path" "$label" <<'PY' +import os +import pathlib +import sys + +state_path = pathlib.Path(sys.argv[1]) +label = sys.argv[2] +previous = state_path.read_text(encoding="utf-8", errors="replace") +current = os.environ["GUEST_LOG"].replace("\r\n", "\n").replace("\r", "\n") + +if current.startswith(previous): + delta = current[len(previous):] +else: + delta = current + +if delta: + sys.stdout.write(f"==> {label}\n") + sys.stdout.write(delta) + +state_path.write_text(current, encoding="utf-8") +PY +} + guest_run_openclaw() { local env_name="${1:-}" local env_value="${2:-}" @@ -1069,8 +1120,8 @@ install_baseline_npm_release() { local version="$2" local script_url local runner_name log_name done_name done_status launcher_state guest_log - local log_state_path - local start_seconds poll_deadline startup_checked poll_rc state_rc log_rc + local log_state_path npm_log_state_path + local start_seconds poll_deadline startup_checked poll_rc state_rc log_rc last_npm_log_poll write_baseline_npm_install_runner_script script_url="http://$host_ip:$HOST_PORT/$(basename "$WINDOWS_BASELINE_INSTALL_SCRIPT_PATH")" @@ -1078,10 +1129,13 @@ install_baseline_npm_release() { log_name="openclaw-install-baseline-$RANDOM-$RANDOM.log" done_name="openclaw-install-baseline-$RANDOM-$RANDOM.done" log_state_path="$(mktemp "${TMPDIR:-/tmp}/openclaw-install-baseline-log-state.XXXXXX")" + npm_log_state_path="$(mktemp "${TMPDIR:-/tmp}/openclaw-install-baseline-npm-log-state.XXXXXX")" : >"$log_state_path" + : >"$npm_log_state_path" start_seconds="$SECONDS" poll_deadline=$((SECONDS + TIMEOUT_INSTALL_S + 60)) startup_checked=0 + last_npm_log_poll=0 guest_powershell_poll 20 "$(cat <= 45 && SECONDS - last_npm_log_poll >= 30 )); then + if ! stream_latest_guest_npm_log_tail_delta \ + "windows baseline install npm debug progress" \ + "$npm_log_state_path"; then + : + fi + last_npm_log_poll=$SECONDS + fi if (( SECONDS >= poll_deadline )); then if ! stream_windows_baseline_install_log; then warn "windows baseline install helper log drain failed after timeout" @@ -1186,6 +1250,7 @@ PY dump_latest_guest_npm_log_tail "windows baseline install npm debug tail" || true warn "windows baseline install helper timed out waiting for done file" rm -f "$log_state_path" + rm -f "$npm_log_state_path" return 1 fi sleep 2 @@ -1324,7 +1389,7 @@ install_main_tgz() { local tgz_url script_url local runner_name log_name done_name done_status launcher_state guest_log local start_seconds poll_deadline startup_checked poll_rc state_rc log_rc - local log_state_path + local log_state_path npm_log_state_path last_npm_log_poll tgz_url="http://$host_ip:$HOST_PORT/$(basename "$MAIN_TGZ_PATH")" write_install_runner_script script_url="http://$host_ip:$HOST_PORT/$(basename "$WINDOWS_INSTALL_SCRIPT_PATH")" @@ -1332,10 +1397,13 @@ install_main_tgz() { log_name="openclaw-install-$RANDOM-$RANDOM.log" done_name="openclaw-install-$RANDOM-$RANDOM.done" log_state_path="$(mktemp "${TMPDIR:-/tmp}/openclaw-install-log-state.XXXXXX")" + npm_log_state_path="$(mktemp "${TMPDIR:-/tmp}/openclaw-install-npm-log-state.XXXXXX")" : >"$log_state_path" + : >"$npm_log_state_path" start_seconds="$SECONDS" poll_deadline=$((SECONDS + TIMEOUT_INSTALL_S + 60)) startup_checked=0 + last_npm_log_poll=0 guest_powershell_poll 20 "$(cat <= 45 && SECONDS - last_npm_log_poll >= 30 )); then + if ! stream_latest_guest_npm_log_tail_delta \ + "windows packaged install npm debug progress" \ + "$npm_log_state_path"; then + : + fi + last_npm_log_poll=$SECONDS + fi if (( SECONDS >= poll_deadline )); then if ! stream_windows_install_log; then warn "windows install helper log drain failed after timeout" @@ -1441,6 +1519,7 @@ PY dump_latest_guest_npm_log_tail "windows packaged install npm debug tail" || true warn "windows install helper timed out waiting for done file" rm -f "$log_state_path" + rm -f "$npm_log_state_path" return 1 fi sleep 2