fix: harden parallels smoke harness

2026-04-21 22:21:33 +00:00 · 2026-04-04 14:18:07 +01:00
parent 99e45eb3ba
commit 79be1e126a
5 changed files with 107 additions and 2 deletions
--- a/scripts/e2e/parallels-linux-smoke.sh
+++ b/scripts/e2e/parallels-linux-smoke.sh
@@ -634,6 +634,20 @@ setsid sh -lc 'exec env OPENCLAW_HOME=/root OPENCLAW_STATE_DIR=/root/.openclaw O
 EOF
 )"
  guest_exec bash -lc "$cmd"
+
+  # On the Ubuntu guest the backgrounded process can bind a few seconds after
+  # the launch command returns. Keep the race inside gateway-start instead of
+  # failing the next phase with a false-negative RPC probe.
+  local deadline
+  deadline=$((SECONDS + TIMEOUT_GATEWAY_S))
+  while (( SECONDS < deadline )); do
+    if show_gateway_status_compat >/dev/null 2>&1; then
+      return 0
+    fi
+    sleep 2
+  done
+
+  return 1
 }

 show_gateway_status_compat() {
--- a/scripts/e2e/parallels-macos-smoke.sh
+++ b/scripts/e2e/parallels-macos-smoke.sh
@@ -474,6 +474,62 @@ wait_for_current_user() {
  return 1
 }

+host_timeout_exec() {
+  # Run the command given after $1 on the host with a $1-second limit. Output is
+  # buffered and replayed afterwards. Exits 124 on timeout, 127/126 if it cannot start.
+  local timeout_s="$1"
+  shift
+  HOST_TIMEOUT_S="$timeout_s" python3 - "$@" <<'PY'
+import os
+import subprocess
+import sys
+
+timeout = int(os.environ["HOST_TIMEOUT_S"])
+out = err = b""
+try:
+    done = subprocess.run(sys.argv[1:], stdout=subprocess.PIPE, stderr=subprocess.PIPE, timeout=timeout)
+    out, err, rc = done.stdout, done.stderr, done.returncode
+except subprocess.TimeoutExpired as exc:
+    # Keep whatever the child printed before the deadline.
+    out, err, rc = exc.stdout or b"", exc.stderr or b"", 124
+    err += f"host timeout after {timeout}s\n".encode()
+except OSError as exc:
+    # Missing or non-executable command: report it without a Python traceback.
+    err, rc = f"host exec failed: {exc}\n".encode(), 127 if isinstance(exc, FileNotFoundError) else 126
+
+sys.stdout.buffer.write(out)
+sys.stderr.buffer.write(err)
+raise SystemExit(rc)
+PY
+}
+
+snapshot_switch_with_retry() {
+  # Switch $VM_NAME to snapshot $1 with a host-side timeout, retrying once.
+  # Returns 0 on success, otherwise the exit status of the last attempt.
+  local snapshot_id="$1" attempt status rc=0
+  for attempt in 1 2; do
+    # Capture the status with `||` rather than toggling `set +e`/`set -e`, so
+    # the caller's errexit setting is never changed as a side effect.
+    rc=0
+    host_timeout_exec "$TIMEOUT_SNAPSHOT_S" prlctl snapshot-switch "$VM_NAME" --id "$snapshot_id" >/dev/null || rc=$?
+    if [[ $rc -eq 0 ]]; then
+      return 0
+    fi
+    # Tahoe occasionally gets stuck mid snapshot-switch and leaves the guest
+    # running or suspended. Reset that state and try once more before failing
+    # the whole lane.
+    warn "snapshot-switch attempt $attempt failed (rc=$rc)"
+    status="$(prlctl status "$VM_NAME" 2>/dev/null || true)"
+    [[ -n "$status" ]] && warn "vm status after snapshot-switch failure: $status"
+    if [[ "$status" == *" running" || "$status" == *" suspended" ]]; then
+      prlctl stop "$VM_NAME" --kill >/dev/null 2>&1 || true
+      wait_for_vm_status "stopped" || true
+    fi
+    sleep 3
+  done
+  return "$rc"
+}
+
 guest_current_user_exec() {
  prlctl exec "$VM_NAME" --current-user /usr/bin/env \
    PATH=/opt/homebrew/bin:/opt/homebrew/opt/node/bin:/opt/homebrew/sbin:/usr/bin:/bin:/usr/sbin:/sbin \
@@ -551,7 +607,7 @@ guest_current_user_sh() {
 restore_snapshot() {
  local snapshot_id="$1"
  say "Restore snapshot $SNAPSHOT_HINT ($snapshot_id)"
-  prlctl snapshot-switch "$VM_NAME" --id "$snapshot_id" >/dev/null
+  snapshot_switch_with_retry "$snapshot_id" || die "snapshot switch failed for $VM_NAME"
  if [[ "$SNAPSHOT_STATE" == "poweroff" ]]; then
    wait_for_vm_status "stopped" || die "restored poweroff snapshot did not reach stopped state in $VM_NAME"
    say "Start restored poweroff snapshot $SNAPSHOT_NAME"
--- a/scripts/e2e/parallels-npm-update-smoke.sh
+++ b/scripts/e2e/parallels-npm-update-smoke.sh
@@ -700,6 +700,15 @@ case "\$version" in
    ;;
 esac
 /opt/homebrew/bin/openclaw models set "$MODEL_ID"
+# Same-guest npm upgrades can leave launchd holding the old gateway process or
+# module graph briefly; wait for a fresh RPC-ready restart before the agent turn.
+/opt/homebrew/bin/openclaw gateway restart
+for _ in 1 2 3 4 5 6 7 8; do
+  if /opt/homebrew/bin/openclaw gateway status --deep --require-rpc >/dev/null 2>&1; then
+    break
+  fi
+  sleep 2
+done
 /opt/homebrew/bin/openclaw gateway status --deep --require-rpc
 /usr/bin/env "$API_KEY_ENV=$API_KEY_VALUE" /opt/homebrew/bin/openclaw agent --agent main --session-id parallels-npm-update-macos-$head_short --message "Reply with exact ASCII text OK only." --json
 EOF
--- a/scripts/e2e/parallels-windows-smoke.sh
+++ b/scripts/e2e/parallels-windows-smoke.sh
@@ -445,6 +445,23 @@ EOF
 )"
 }

+ensure_vm_running_for_retry() {
+  # Bring $VM_NAME back up before the next retry attempt: resume it when
+  # `prlctl status` ends in " suspended", start it when it ends in " stopped".
+  # Any other status (or a failed status query) is left untouched.
+  local vm_state
+  vm_state="$(prlctl status "$VM_NAME" 2>/dev/null || true)"
+  if [[ "$vm_state" == *" suspended" ]]; then
+    # Some Windows guest transport drops leave the VM suspended between retry
+    # attempts; wake it before the next prlctl exec.
+    warn "VM suspended during retry path; resuming $VM_NAME"
+    prlctl resume "$VM_NAME" >/dev/null
+  elif [[ "$vm_state" == *" stopped" ]]; then
+    warn "VM stopped during retry path; starting $VM_NAME"
+    prlctl start "$VM_NAME" >/dev/null
+  fi
+}
+
 run_windows_retry() {
  local label="$1"
  local max_attempts="$2"
@@ -463,7 +480,12 @@ run_windows_retry() {
    fi
    warn "$label attempt $attempt failed (rc=$rc)"
    if (( attempt < max_attempts )); then
-      wait_for_guest_ready >/dev/null 2>&1 || true
+      if ! ensure_vm_running_for_retry >/dev/null 2>&1; then
+        :
+      fi
+      if ! wait_for_guest_ready >/dev/null 2>&1; then
+        :
+      fi
      sleep 5
    fi
  done