Files
openclaw/scripts/e2e/multi-node-update-docker.sh
Alex Knight c81271ee6e Fix managed Gateway updates across CLI and service Node skew (#84043)
Summary:
- The PR pins managed Gateway package updates, runtime preflight, post-install doctor, post-core update, service refresh, and restart follow-ups to the Node binary and package root baked into the Gateway service.
- Reproducibility: yes. source-level. Current main validates and follows up with the shell process Node in the ...  body provides a concrete two-Node Docker reproduction, though I did not execute it in this read-only pass.

Automerge notes:
- PR branch already contained follow-up commit before automerge: fix(update): detect service node mismatch even when package roots match
- PR branch already contained follow-up commit before automerge: fix(update): pin package install to service root when nodes differ wi…

Validation:
- ClawSweeper review passed for head 5607e441f6.
- Required merge gates passed before the squash merge.

Prepared head SHA: 5607e441f6
Review: https://github.com/openclaw/openclaw/pull/84043#issuecomment-4485613931

Co-authored-by: Alex Knight <15041791+amknight@users.noreply.github.com>
Co-authored-by: Alex Knight <aknight@atlassian.com>
Co-authored-by: clawsweeper[bot] <274271284+clawsweeper[bot]@users.noreply.github.com>
Approved-by: amknight
Co-authored-by: amknight <15041791+amknight@users.noreply.github.com>
2026-05-19 20:44:29 +00:00

405 lines
15 KiB
Bash
Executable File

#!/usr/bin/env bash
# Reproduces the multi-node-install update bug.
#
# Sets up two independent Node installations inside a Docker container, installs
# OpenClaw under node-A, registers the gateway service pointing at node-A, then
# switches PATH so node-B comes first and runs `openclaw update`. Verifies that:
#
# 1. The update targets the wrong install root (node-B npm prefix) or produces
# a gateway service definition pointing at node-B while the package lives
# under node-A.
# 2. The gateway fails to start or runs a stale/missing entrypoint.
#
# Usage:
# ./scripts/e2e/multi-node-update-docker.sh
#
# Requires: Docker, a built openclaw-current.tgz (or will build one).
set -euo pipefail
ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"
source "$ROOT_DIR/scripts/lib/docker-e2e-image.sh"
source "$ROOT_DIR/scripts/lib/docker-e2e-package.sh"
IMAGE_NAME="openclaw-multi-node-update-e2e"
DOCKER_RUN_TIMEOUT="${OPENCLAW_MULTI_NODE_DOCKER_TIMEOUT:-300s}"
ARTIFACT_DIR="${OPENCLAW_MULTI_NODE_ARTIFACT_DIR:-$ROOT_DIR/.artifacts/multi-node-update}"
mkdir -p "$ARTIFACT_DIR"
chmod -R a+rwX "$ARTIFACT_DIR" || true
# Build the bare e2e image and prepare the package tarball.
docker_e2e_build_or_reuse "$IMAGE_NAME" multi-node-update "$ROOT_DIR/scripts/e2e/Dockerfile" "$ROOT_DIR" "bare" "${OPENCLAW_SKIP_DOCKER_BUILD:-0}"
PACKAGE_TGZ="$(docker_e2e_prepare_package_tgz multi-node-update "${OPENCLAW_CURRENT_PACKAGE_TGZ:-}")"
docker_e2e_package_mount_args "$PACKAGE_TGZ"
echo "=== Running multi-node-update Docker E2E ==="
CONTAINER_EXIT=0
docker_e2e_run_with_harness \
-e COREPACK_ENABLE_DOWNLOAD_PROMPT=0 \
-e CI=true \
-e OPENCLAW_NO_ONBOARD=1 \
-e OPENCLAW_NO_PROMPT=1 \
-e OPENCLAW_SKIP_PROVIDERS=1 \
-e OPENCLAW_SKIP_CHANNELS=1 \
-e OPENCLAW_DISABLE_BONJOUR=1 \
-e OPENAI_API_KEY=sk-multi-node-test \
-v "$ARTIFACT_DIR:/tmp/artifacts" \
"${DOCKER_E2E_PACKAGE_ARGS[@]}" \
--user root \
-e HOME=/root \
"$IMAGE_NAME" \
timeout "$DOCKER_RUN_TIMEOUT" bash -lc '
set -euo pipefail
ARTIFACTS=/tmp/artifacts
exec > >(tee "$ARTIFACTS/run.log") 2>&1
echo "========================================"
echo " Multi-Node Update Bug Reproduction"
echo "========================================"
echo ""
# ── Step 1: Create two separate Node installations ──────────────────────
echo "── Step 1: Setting up two Node installations ──"
# node-A is the system node that ships with the Docker image (node:24-bookworm-slim).
NODE_A="$(command -v node)"
NODE_A_DIR="$(dirname "$NODE_A")"
NODE_A_VERSION="$("$NODE_A" --version)"
echo "node-A: $NODE_A ($NODE_A_VERSION)"
# Set up independent npm prefixes.
NPM_PREFIX_A="/opt/npm-prefix-a"
NPM_PREFIX_B="/opt/npm-prefix-b"
mkdir -p "$NPM_PREFIX_A/bin" "$NPM_PREFIX_A/lib" "$NPM_PREFIX_B/bin" "$NPM_PREFIX_B/lib"
# node-B is a second, full Node installation created by copying the entire
# node prefix. This simulates having two real node installs (e.g. Homebrew +
# nvm, or system node + volta).
NODE_B_ROOT="/opt/node-b"
NODE_A_PREFIX="$(dirname "$NODE_A_DIR")"
mkdir -p "$NODE_B_ROOT"
cp -a "$NODE_A_PREFIX/bin" "$NODE_B_ROOT/bin"
cp -a "$NODE_A_PREFIX/lib" "$NODE_B_ROOT/lib"
chmod -R +x "$NODE_B_ROOT/bin/"*
# Configure node-B npm to use its own global prefix (not node-A prefix).
export npm_config_prefix_orig="${npm_config_prefix:-}"
"$NODE_B_ROOT/bin/node" "$NODE_B_ROOT/bin/npm" config set prefix "$NPM_PREFIX_B" --global 2>/dev/null || true
NODE_B="$NODE_B_ROOT/bin/node"
NODE_B_VERSION="$("$NODE_B" --version)"
echo "node-B: $NODE_B ($NODE_B_VERSION)"
echo ""
echo "── Step 2: Install OpenClaw under node-A ──"
# Use node-A to install openclaw with npm prefix A.
export npm_config_prefix="$NPM_PREFIX_A"
export NPM_CONFIG_PREFIX="$NPM_PREFIX_A"
export npm_config_loglevel=error
export npm_config_fund=false
export npm_config_audit=false
export PATH="$NPM_PREFIX_A/bin:$NODE_A_DIR:$PATH"
echo "Installing OpenClaw package under node-A prefix: $NPM_PREFIX_A"
npm install -g /tmp/openclaw-current.tgz --no-fund --no-audit >"$ARTIFACTS/install-a.log" 2>&1
echo "Installed. Checking openclaw location..."
OPENCLAW_A="$(command -v openclaw)"
echo "openclaw binary: $OPENCLAW_A"
echo "openclaw version: $(openclaw --version 2>/dev/null || echo unknown)"
# Record the package root for node-A install.
PACKAGE_ROOT_A="$NPM_PREFIX_A/lib/node_modules/openclaw"
echo "Package root A: $PACKAGE_ROOT_A"
ls -la "$PACKAGE_ROOT_A/package.json" 2>/dev/null || echo "WARNING: package.json not found at A"
echo ""
echo "── Step 3: Install the systemd service (gateway) using node-A ──"
# Create a systemctl shim since we are in Docker (no real systemd).
SHIM_DIR="/usr/local/bin"
GATEWAY_UNIT_PATH="/root/.config/systemd/user/openclaw-gateway.service"
SYSTEMCTL_LOG="$ARTIFACTS/systemctl-shim.log"
GATEWAY_DAEMON_LOG="$ARTIFACTS/gateway-daemon.log"
GATEWAY_PID_FILE="$ARTIFACTS/gateway.pid"
: >"$SYSTEMCTL_LOG"
cat >"$SHIM_DIR/systemctl" <<SHIMEOF
#!/usr/bin/env bash
set -euo pipefail
printf "%s %s\n" "\$(date -u +%H:%M:%S)" "\$*" >>"$SYSTEMCTL_LOG"
filtered=()
for arg in "\$@"; do
case "\$arg" in
--user|--quiet|--no-page|--now) ;;
*) filtered+=("\$arg") ;;
esac
done
command="\${filtered[0]:-status}"
case "\$command" in
daemon-reload)
echo "daemon-reload (shim: no-op)"
;;
enable)
echo "enable (shim: no-op)"
;;
restart|start)
if [ -s "$GATEWAY_PID_FILE" ]; then
old_pid="\$(cat "$GATEWAY_PID_FILE" 2>/dev/null || true)"
if kill -0 "\$old_pid" 2>/dev/null; then
kill "\$old_pid" 2>/dev/null || true
sleep 0.5
fi
fi
unit="$GATEWAY_UNIT_PATH"
if [ ! -f "\$unit" ]; then
echo "systemctl shim: unit not found: \$unit" >&2
exit 1
fi
exec_start="\$(grep "^ExecStart=" "\$unit" | head -1 | sed "s/^ExecStart=//")"
if [ -z "\$exec_start" ]; then
echo "systemctl shim: no ExecStart in \$unit" >&2
exit 1
fi
# Source EnvironmentFile if present
env_file="\$(grep "^EnvironmentFile=" "\$unit" | head -1 | sed "s/^EnvironmentFile=//" | sed "s/^-//")"
if [ -n "\$env_file" ] && [ -f "\$env_file" ]; then
set -a; source "\$env_file"; set +a
fi
# Inline Environment= entries
while IFS= read -r env_line; do
env_entry="\${env_line#Environment=}"
env_entry="\${env_entry#\"}"
env_entry="\${env_entry%\"}"
export "\$env_entry"
done < <(grep "^Environment=" "\$unit" || true)
echo "systemctl shim: starting: \$exec_start"
eval nohup \$exec_start >>"$GATEWAY_DAEMON_LOG" 2>&1 &
echo "\$!" >"$GATEWAY_PID_FILE"
echo "systemctl shim: started pid \$(cat "$GATEWAY_PID_FILE")"
;;
stop)
if [ -s "$GATEWAY_PID_FILE" ]; then
pid="\$(cat "$GATEWAY_PID_FILE")"
kill "\$pid" 2>/dev/null || true
rm -f "$GATEWAY_PID_FILE"
fi
;;
is-active)
if [ -s "$GATEWAY_PID_FILE" ] && kill -0 "\$(cat "$GATEWAY_PID_FILE" 2>/dev/null)" 2>/dev/null; then
echo "active"
else
echo "inactive"
exit 3
fi
;;
show)
echo "ActiveState=inactive"
;;
*)
echo "systemctl shim: ignoring: \$*"
;;
esac
SHIMEOF
chmod +x "$SHIM_DIR/systemctl"
echo "systemctl shim installed."
# Now install the gateway service using node-A.
echo "Installing gateway service..."
mkdir -p "$(dirname "$GATEWAY_UNIT_PATH")"
# gateway install may exit non-zero because our systemctl shim cannot fully
# restart, but the unit file gets written before the restart step.
openclaw gateway install --json >"$ARTIFACTS/gateway-install.json" 2>"$ARTIFACTS/gateway-install.err" || true
echo ""
echo "── Step 4: Inspect what node path was baked into the service ──"
if [ -f "$GATEWAY_UNIT_PATH" ]; then
echo "Service unit contents:"
cat "$GATEWAY_UNIT_PATH" | tee "$ARTIFACTS/unit-before-update.txt"
echo ""
EXEC_START_BEFORE="$(grep "^ExecStart=" "$GATEWAY_UNIT_PATH" | head -1)"
BAKED_NODE_BEFORE="$(echo "$EXEC_START_BEFORE" | sed "s/^ExecStart=//" | awk "{print \$1}")"
echo "Baked node path BEFORE update: $BAKED_NODE_BEFORE"
else
echo "FAIL: Gateway unit file was not created at $GATEWAY_UNIT_PATH"
echo "gateway install output:"
cat "$ARTIFACTS/gateway-install.json" 2>/dev/null || true
cat "$ARTIFACTS/gateway-install.err" 2>/dev/null || true
exit 1
fi
echo ""
echo "── Step 5: Switch PATH so node-B comes first ──"
# Simulate the user scenario: their PATH changes (e.g. they installed
# a second Node via nvm, brew, etc.) and the new node-B comes first.
# Crucially, node-B has its own working npm with its own global prefix,
# but openclaw is NOT installed there.
export PATH="$NPM_PREFIX_B/bin:$NODE_B_ROOT/bin:$NPM_PREFIX_A/bin:$NODE_A_DIR:$PATH"
# Verify node-B npm works independently.
echo "node-B npm prefix: $($NODE_B_ROOT/bin/node $NODE_B_ROOT/bin/npm prefix -g 2>/dev/null || echo unknown)"
echo "which node: $(command -v node)"
echo "which openclaw: $(command -v openclaw)"
echo "process.execPath will be: $(node -e "console.log(process.execPath)")"
echo ""
echo "── Step 6: Run openclaw update (this is the bug) ──"
# Run the update WITH restart so that the update flow re-runs
# `gateway install --force` and bakes the current process.execPath
# (now node-B) into the service unit. This is where the split happens.
echo "Running openclaw update --yes --json..."
UPDATE_EXIT=0
openclaw update --yes --json \
--tag /tmp/openclaw-current.tgz \
>"$ARTIFACTS/update.json" 2>"$ARTIFACTS/update.err" || UPDATE_EXIT=$?
echo ""
echo "Update exit code: $UPDATE_EXIT"
echo "Update stderr (if any):"
cat "$ARTIFACTS/update.err" 2>/dev/null | tail -10 || true
# The update may fail during restart (systemctl shim limitations) but it must
# have at least attempted the package install. Check that it ran past early exit.
if [ "$UPDATE_EXIT" -ne 0 ] && ! grep -q "gateway" "$ARTIFACTS/update.err" 2>/dev/null; then
echo "FAIL: openclaw update failed before reaching the package install step"
cat "$ARTIFACTS/update.err" 2>/dev/null || true
exit 1
fi
echo ""
echo "── Step 7: Inspect the service unit AFTER update ──"
if [ -f "$GATEWAY_UNIT_PATH" ]; then
echo "Service unit contents after update:"
cat "$GATEWAY_UNIT_PATH" | tee "$ARTIFACTS/unit-after-update.txt"
echo ""
EXEC_START_AFTER="$(grep "^ExecStart=" "$GATEWAY_UNIT_PATH" | head -1)"
BAKED_NODE_AFTER="$(echo "$EXEC_START_AFTER" | sed "s/^ExecStart=//" | awk "{print \$1}")"
echo "Baked node path AFTER update: $BAKED_NODE_AFTER"
else
echo "No unit file after update."
fi
echo ""
echo "── Step 8: Verify results ──"
BAKED_NODE_BEFORE="${BAKED_NODE_BEFORE:-unknown}"
BAKED_NODE_AFTER="${BAKED_NODE_AFTER:-unknown}"
echo "Node A: $NODE_A"
echo "Node B: $NODE_B"
echo "Baked BEFORE update: $BAKED_NODE_BEFORE"
echo "Baked AFTER update: $BAKED_NODE_AFTER"
echo "Package root A: $PACKAGE_ROOT_A"
echo ""
# Check 1: Did the baked node path change from A to B?
if [ "$BAKED_NODE_AFTER" = "$NODE_B" ] && [ "$BAKED_NODE_BEFORE" != "$NODE_B" ]; then
echo "BUG CONFIRMED: Gateway service now points at node-B ($NODE_B)"
echo " but OpenClaw package is still under node-A prefix ($PACKAGE_ROOT_A)."
echo " The gateway will use node-B to run an entrypoint that may reference"
echo " node-A dependencies or may not exist under node-B global prefix."
elif [ "$BAKED_NODE_AFTER" = "$BAKED_NODE_BEFORE" ]; then
echo "FIXED: Gateway service still points at the original node ($BAKED_NODE_AFTER)"
else
echo "CHANGED: Node path changed from $BAKED_NODE_BEFORE to $BAKED_NODE_AFTER"
fi
# Check 2: Is the OpenClaw package installed under node-B npm prefix?
if [ -f "$NPM_PREFIX_B/lib/node_modules/openclaw/package.json" ]; then
echo "WARNING: OpenClaw was ALSO installed under node-B prefix (split install)"
else
echo "OK: OpenClaw is NOT under node-B prefix (expected: only under node-A)"
fi
# Check 3: Does the entrypoint in the unit file actually exist?
if [ -f "$GATEWAY_UNIT_PATH" ]; then
EXEC_START_AFTER="$(grep "^ExecStart=" "$GATEWAY_UNIT_PATH" | head -1 | sed "s/^ExecStart=//")"
ENTRYPOINT_PATH="$(echo "$EXEC_START_AFTER" | awk "{print \$2}")"
if [ -n "$ENTRYPOINT_PATH" ] && [ ! -f "$ENTRYPOINT_PATH" ]; then
echo "BUG: Entrypoint in service unit does not exist: $ENTRYPOINT_PATH"
elif [ -n "$ENTRYPOINT_PATH" ]; then
echo "OK: Entrypoint exists: $ENTRYPOINT_PATH"
fi
fi
# Check 4: Were there any warnings about split install in the update output?
if [ -f "$ARTIFACTS/update.err" ]; then
if grep -qi "Shell OpenClaw root differs" "$ARTIFACTS/update.err" 2>/dev/null; then
echo "OK: Update warned about split root"
fi
if grep -qi "Managed gateway service Node" "$ARTIFACTS/update.err" 2>/dev/null; then
echo "OK: Update showed the managed service Node path"
fi
fi
# Check 5: Try to start the gateway and see if it works.
echo ""
echo "── Step 9: Try starting the gateway with the post-update unit ──"
if [ -f "$GATEWAY_UNIT_PATH" ]; then
systemctl restart 2>&1 || true
sleep 3
if [ -s "$GATEWAY_PID_FILE" ] && kill -0 "$(cat "$GATEWAY_PID_FILE" 2>/dev/null)" 2>/dev/null; then
echo "OK: Gateway started (pid $(cat "$GATEWAY_PID_FILE"))"
# Try a health probe.
if openclaw gateway status --json >"$ARTIFACTS/status.json" 2>&1; then
echo "OK: Gateway status probe succeeded"
else
echo "WARNING: Gateway status probe failed"
fi
# Stop it.
kill "$(cat "$GATEWAY_PID_FILE")" 2>/dev/null || true
else
echo "BUG: Gateway failed to start with the post-update unit"
cat "$GATEWAY_DAEMON_LOG" 2>/dev/null | tail -20 || true
fi
fi
echo ""
echo "========================================"
echo " Reproduction complete."
echo " Artifacts saved to /tmp/artifacts/"
echo "========================================"
# ── Final exit code ──────────────────────────────────────────────────────────
# Exit non-zero if any BUG was found, making this usable as a CI gate.
EXIT_CODE=0
if [ "$BAKED_NODE_AFTER" = "$NODE_B" ] && [ "$BAKED_NODE_BEFORE" != "$NODE_B" ]; then
EXIT_CODE=1
fi
if [ -f "$NPM_PREFIX_B/lib/node_modules/openclaw/package.json" ]; then
EXIT_CODE=1
fi
if [ -f "$GATEWAY_UNIT_PATH" ]; then
ENTRYPOINT_PATH_CHECK="$(grep "^ExecStart=" "$GATEWAY_UNIT_PATH" | head -1 | sed "s/^ExecStart=//" | awk "{print \$2}")" || true
if [ -n "$ENTRYPOINT_PATH_CHECK" ] && [ ! -f "$ENTRYPOINT_PATH_CHECK" ]; then
EXIT_CODE=1
fi
fi
exit $EXIT_CODE
' || CONTAINER_EXIT=$?
echo ""
echo "=== Artifacts ==="
echo "Logs saved to: $ARTIFACT_DIR/"
ls -la "$ARTIFACT_DIR/" 2>/dev/null || true
if [ -f "$ARTIFACT_DIR/run.log" ]; then
echo ""
echo "=== Key results ==="
grep -E "^(BUG|FIXED|OK|CHANGED|WARNING)" "$ARTIFACT_DIR/run.log" || echo "(no key results found)"
fi
if [ "$CONTAINER_EXIT" -ne 0 ]; then
echo ""
echo "FAIL: Docker container exited with code $CONTAINER_EXIT"
fi
exit "$CONTAINER_EXIT"