test: improve parallels smoke diagnostics

2026-04-16 03:31:10 +00:00 · 2026-04-06 14:41:19 +01:00
parent 50082f91ff
commit 53f86745e1
3 changed files with 103 additions and 9 deletions
--- a/.agents/skills/openclaw-parallels-smoke/SKILL.md
+++ b/.agents/skills/openclaw-parallels-smoke/SKILL.md
@@ -17,6 +17,7 @@ Use this skill for Parallels guest workflows and smoke interpretation. Do not lo
 - Per-phase logs land under `/tmp/openclaw-parallels-*`.
 - Do not run local and gateway agent turns in parallel on the same fresh workspace or session.
 - If `main` is moving under active multi-agent work, prefer a detached worktree pinned to one commit for long Parallels suites. The smoke scripts now verify the packed tgz commit instead of live `git rev-parse HEAD`, but a pinned worktree still avoids noisy rebuild/version drift during reruns.
+- For `openclaw update --channel dev` lanes, remember the guest clones GitHub `main`, not your local worktree. If a local fix exists but the rerun still fails inside the cloned dev checkout, do not treat that as disproof of the fix until the branch has been pushed.
 - For `prlctl exec`, pass the VM name before `--current-user` (`prlctl exec "$VM" --current-user ...`), not the other way around.
 - If the workflow installs OpenClaw from a repo checkout instead of the site installer/npm release, finish by installing a real guest CLI shim and verifying it in a fresh guest shell. `pnpm openclaw ...` inside the repo is not enough for handoff parity.
 - On macOS guests, prefer a user-global install plus a stable PATH-visible shim:
@@ -79,7 +80,9 @@ Use this skill for Parallels guest workflows and smoke interpretation. Do not lo
 - Windows installer/tgz phases now retry once after guest-ready recheck; keep new Windows smoke steps idempotent so a transport-flake retry is safe.
 - If a Windows retry sees the VM become `suspended` or `stopped`, resume/start it before the next `prlctl exec`; otherwise the second attempt just repeats the same `rc=255`.
 - Windows global `npm install -g` phases can stay quiet for a minute or more even when healthy; inspect the phase log before calling it hung, and only treat it as a regression once the retry wrapper or timeout trips.
+- When those Windows global installs stay quiet, the useful progress often lives in the guest npm debug log, not the helper phase log. The smoke script now streams incremental `npm-cache/_logs/*-debug-0.log` deltas into the phase log during long baseline/package installs; read those lines before assuming the lane is stalled.
 - The Windows baseline-package helpers now auto-dump the latest guest `npm-cache/_logs/*-debug-0.log` tail on timeout or nonzero completion. Read that tail in the phase log before opening a second guest shell.
+- The same incremental npm-debug streaming also applies to `--upgrade-from-packed-main` / packaged-install baseline phases. A phase log that shows only `install.start`, `install.download-tgz`, and `install.install-tgz` can still be healthy if the streamed npm-debug section shows registry fetches or bundled-plugin postinstall work.
 - Fresh Windows tgz install phases should also use the background PowerShell runner plus done-file/log-drain pattern; do not rely on one long-lived `prlctl exec ... powershell ... npm install -g` transport for package installs.
 - Windows release-to-dev helpers should log `where pnpm` before and after the update and require `where pnpm` to succeed post-update. That proves the updater installed or enabled `pnpm` itself instead of depending on a smoke-only bootstrap.
 - Fresh Windows ref-mode onboard should use the same background PowerShell runner plus done-file/log-drain pattern as the npm-update helper, including startup materialization checks, host-side timeouts on short poll `prlctl exec` calls, and retry-on-poll-failure behavior for transient transport flakes.
--- a/scripts/e2e/parallels-macos-smoke.sh
+++ b/scripts/e2e/parallels-macos-smoke.sh
@@ -41,6 +41,7 @@ RUN_DIR="$(mktemp -d /tmp/openclaw-parallels-smoke.XXXXXX)"
 BUILD_LOCK_DIR="${TMPDIR:-/tmp}/openclaw-parallels-build.lock"

 TIMEOUT_INSTALL_S=900
+TIMEOUT_UPDATE_DEV_S=1500
 TIMEOUT_VERIFY_S=60
 TIMEOUT_ONBOARD_S=180
 TIMEOUT_GATEWAY_S=60
@@ -708,13 +709,24 @@ run_dev_channel_update() {
  update_entry="$update_root/openclaw.mjs"
  ensure_guest_pnpm_for_dev_update
  printf 'update-dev: run\n'
-  guest_current_user_exec /bin/rm -rf "$update_root"
-  guest_current_user_exec_path "$bootstrap_bin:$GUEST_EXEC_PATH" \
-    "$GUEST_NODE_BIN" "$GUEST_OPENCLAW_ENTRY" update --channel dev --yes --json
+  guest_current_user_sh "$(cat <<EOF
+rm -rf $(shell_quote "$update_root")
+export PATH=$(shell_quote "$bootstrap_bin:$GUEST_EXEC_PATH")
+$GUEST_NODE_BIN $GUEST_OPENCLAW_ENTRY update --channel dev --yes --json
+EOF
+)"
  printf 'update-dev: git-version\n'
-  guest_current_user_node_cli "$update_entry" --version
+  guest_current_user_sh "$(cat <<EOF
+export PATH=$(shell_quote "$GUEST_EXEC_PATH")
+$GUEST_NODE_BIN $(shell_quote "$update_entry") --version
+EOF
+)"
  printf 'update-dev: git-status\n'
-  guest_current_user_node_cli "$update_entry" update status --json
+  guest_current_user_sh "$(cat <<EOF
+export PATH=$(shell_quote "$GUEST_EXEC_PATH")
+$GUEST_NODE_BIN $(shell_quote "$update_entry") update status --json
+EOF
+)"
 }

 verify_dev_channel_update() {
@@ -1358,7 +1370,7 @@ run_upgrade_lane() {
    phase_run "upgrade.verify-main-version" "$TIMEOUT_VERIFY_S" verify_target_version
    phase_run "upgrade.verify-bundle-permissions" "$TIMEOUT_PERMISSION_S" verify_bundle_permissions
  else
-    phase_run "upgrade.update-dev" "$TIMEOUT_INSTALL_S" run_dev_channel_update
+    phase_run "upgrade.update-dev" "$TIMEOUT_UPDATE_DEV_S" run_dev_channel_update
    UPGRADE_MAIN_VERSION="$(extract_last_version "$(phase_log_path upgrade.update-dev)")"
    phase_run "upgrade.verify-dev-channel" "$TIMEOUT_VERIFY_S" verify_dev_channel_update
  fi
--- a/scripts/e2e/parallels-windows-smoke.sh
+++ b/scripts/e2e/parallels-windows-smoke.sh
@@ -494,6 +494,57 @@ EOF
  printf '%s\n' "$npm_log"
 }

+# Print only the not-yet-streamed lines of the newest guest npm debug log.
+# $1: heading printed before new lines. $2: host file holding the previous
+# snapshot. Returns the poll status if it fails or is empty, else python's.
+stream_latest_guest_npm_log_tail_delta() {
+  local label="$1" state_path="$2" npm_log rc=0
+  # `|| rc=$?` captures the status without toggling the caller's errexit state.
+  npm_log="$(
+    guest_powershell_poll 20 "$(cat <<'EOF'
+$logDir = Join-Path $env:LOCALAPPDATA 'npm-cache\_logs'
+if (-not (Test-Path $logDir)) {
+  exit 0
+}
+$latest = Get-ChildItem $logDir -Filter '*-debug-0.log' |
+  Sort-Object LastWriteTime -Descending |
+  Select-Object -First 1
+if ($null -eq $latest) {
+  exit 0
+}
+"==> npm-debug-log"
+$latest.FullName
+Get-Content $latest.FullName -Tail 80
+EOF
+)"
+  )" || rc=$?
+  if [[ $rc -ne 0 || -z "$npm_log" ]]; then
+    return "$rc"
+  fi
+  GUEST_LOG="$npm_log" python3 - "$state_path" "$label" <<'PY'
+import os
+import pathlib
+import sys
+
+state_path = pathlib.Path(sys.argv[1])
+old = state_path.read_text(encoding="utf-8", errors="replace").split("\n")
+new = os.environ["GUEST_LOG"].replace("\r\n", "\n").replace("\r", "\n").split("\n")
+fresh = new
+if old[:2] == new[:2]:
+    # Same log file (marker + path match). The guest sends a sliding 80-line
+    # tail, so align on the longest suffix of the old body that prefixes the new.
+    old_body, body = old[2:], new[2:]
+    keep = next(
+        (k for k in range(min(len(old_body), len(body)), 0, -1) if old_body[-k:] == body[:k]),
+        0,
+    )
+    fresh = body[keep:]
+if fresh:
+    sys.stdout.write(f"==> {sys.argv[2]}\n" + "\n".join(fresh) + "\n")
+state_path.write_text("\n".join(new), encoding="utf-8")
+PY
+}
+
 guest_run_openclaw() {
  local env_name="${1:-}"
  local env_value="${2:-}"
@@ -1069,8 +1120,8 @@ install_baseline_npm_release() {
  local version="$2"
  local script_url
  local runner_name log_name done_name done_status launcher_state guest_log
-  local log_state_path
-  local start_seconds poll_deadline startup_checked poll_rc state_rc log_rc
+  local log_state_path npm_log_state_path
+  local start_seconds poll_deadline startup_checked poll_rc state_rc log_rc last_npm_log_poll

  write_baseline_npm_install_runner_script
  script_url="http://$host_ip:$HOST_PORT/$(basename "$WINDOWS_BASELINE_INSTALL_SCRIPT_PATH")"
@@ -1078,10 +1129,13 @@ install_baseline_npm_release() {
  log_name="openclaw-install-baseline-$RANDOM-$RANDOM.log"
  done_name="openclaw-install-baseline-$RANDOM-$RANDOM.done"
  log_state_path="$(mktemp "${TMPDIR:-/tmp}/openclaw-install-baseline-log-state.XXXXXX")"
+  npm_log_state_path="$(mktemp "${TMPDIR:-/tmp}/openclaw-install-baseline-npm-log-state.XXXXXX")"
  : >"$log_state_path"
+  : >"$npm_log_state_path"
  start_seconds="$SECONDS"
  poll_deadline=$((SECONDS + TIMEOUT_INSTALL_S + 60))
  startup_checked=0
+  last_npm_log_poll=0

  guest_powershell_poll 20 "$(cat <<EOF
 \$runner = Join-Path \$env:TEMP '$runner_name'
@@ -1161,6 +1215,7 @@ PY
        dump_latest_guest_npm_log_tail "windows baseline install npm debug tail" || true
      fi
      rm -f "$log_state_path"
+      rm -f "$npm_log_state_path"
      [[ "$done_status" == "0" ]]
      return $?
    fi
@@ -1176,9 +1231,18 @@ PY
      if [[ $state_rc -eq 0 && "$launcher_state" == *"runner=False"* && "$launcher_state" == *"log=False"* && "$launcher_state" == *"done=False"* ]]; then
        warn "windows baseline install helper failed to materialize guest files"
        rm -f "$log_state_path"
+        rm -f "$npm_log_state_path"
        return 1
      fi
    fi
+    if (( SECONDS - start_seconds >= 45 && SECONDS - last_npm_log_poll >= 30 )); then
+      if ! stream_latest_guest_npm_log_tail_delta \
+        "windows baseline install npm debug progress" \
+        "$npm_log_state_path"; then
+        :
+      fi
+      last_npm_log_poll=$SECONDS
+    fi
    if (( SECONDS >= poll_deadline )); then
      if ! stream_windows_baseline_install_log; then
        warn "windows baseline install helper log drain failed after timeout"
@@ -1186,6 +1250,7 @@ PY
      dump_latest_guest_npm_log_tail "windows baseline install npm debug tail" || true
      warn "windows baseline install helper timed out waiting for done file"
      rm -f "$log_state_path"
+      rm -f "$npm_log_state_path"
      return 1
    fi
    sleep 2
@@ -1324,7 +1389,7 @@ install_main_tgz() {
  local tgz_url script_url
  local runner_name log_name done_name done_status launcher_state guest_log
  local start_seconds poll_deadline startup_checked poll_rc state_rc log_rc
-  local log_state_path
+  local log_state_path npm_log_state_path last_npm_log_poll
  tgz_url="http://$host_ip:$HOST_PORT/$(basename "$MAIN_TGZ_PATH")"
  write_install_runner_script
  script_url="http://$host_ip:$HOST_PORT/$(basename "$WINDOWS_INSTALL_SCRIPT_PATH")"
@@ -1332,10 +1397,13 @@ install_main_tgz() {
  log_name="openclaw-install-$RANDOM-$RANDOM.log"
  done_name="openclaw-install-$RANDOM-$RANDOM.done"
  log_state_path="$(mktemp "${TMPDIR:-/tmp}/openclaw-install-log-state.XXXXXX")"
+  npm_log_state_path="$(mktemp "${TMPDIR:-/tmp}/openclaw-install-npm-log-state.XXXXXX")"
  : >"$log_state_path"
+  : >"$npm_log_state_path"
  start_seconds="$SECONDS"
  poll_deadline=$((SECONDS + TIMEOUT_INSTALL_S + 60))
  startup_checked=0
+  last_npm_log_poll=0

  guest_powershell_poll 20 "$(cat <<EOF
 \$runner = Join-Path \$env:TEMP '$runner_name'
@@ -1416,6 +1484,7 @@ PY
        dump_latest_guest_npm_log_tail "windows packaged install npm debug tail" || true
      fi
      rm -f "$log_state_path"
+      rm -f "$npm_log_state_path"
      [[ "$done_status" == "0" ]]
      return $?
    fi
@@ -1431,9 +1500,18 @@ PY
      if [[ $state_rc -eq 0 && "$launcher_state" == *"runner=False"* && "$launcher_state" == *"log=False"* && "$launcher_state" == *"done=False"* ]]; then
        warn "windows install helper failed to materialize guest files"
        rm -f "$log_state_path"
+        rm -f "$npm_log_state_path"
        return 1
      fi
    fi
+    if (( SECONDS - start_seconds >= 45 && SECONDS - last_npm_log_poll >= 30 )); then
+      if ! stream_latest_guest_npm_log_tail_delta \
+        "windows packaged install npm debug progress" \
+        "$npm_log_state_path"; then
+        :
+      fi
+      last_npm_log_poll=$SECONDS
+    fi
    if (( SECONDS >= poll_deadline )); then
      if ! stream_windows_install_log; then
        warn "windows install helper log drain failed after timeout"
@@ -1441,6 +1519,7 @@ PY
      dump_latest_guest_npm_log_tail "windows packaged install npm debug tail" || true
      warn "windows install helper timed out waiting for done file"
      rm -f "$log_state_path"
+      rm -f "$npm_log_state_path"
      return 1
    fi
    sleep 2