From 4ee537a04ad00d70c0d2288dec7d23f910e81bf0 Mon Sep 17 00:00:00 2001 From: Peter Steinberger Date: Sun, 26 Apr 2026 07:49:08 +0100 Subject: [PATCH] fix(node-runtime): keep node-host recovering after gateway restarts --- CHANGELOG.md | 1 + docs/cli/node.md | 6 +++ src/gateway/client.test.ts | 49 ++++++++++++++++++++ src/gateway/client.ts | 28 +++++++++++- src/node-host/runner.credentials.test.ts | 58 +++++++++++++++++++++++- src/node-host/runner.ts | 47 ++++++++++++++++++- 6 files changed, 185 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 4bf66995b17..c8e4ac83b66 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -78,6 +78,7 @@ Docs: https://docs.openclaw.ai - Gateway/install: refresh loaded gateway service installs when the current service embeds stale gateway auth instead of returning already-installed, avoiding LaunchAgent token-mismatch loops after token rotation. Fixes #70752. Thanks @hyspacex. - Update: ignore bundled plugin `.openclaw-install-stage` directories during global install verification and packaged dist pruning so leftover runtime-dep staging files do not turn successful updates into `unexpected packaged dist file` failures. Fixes #71752. Thanks @waynegault. +- Node runtime: keep node-host retry timers alive across Gateway restarts and exit on terminal credential pauses so supervised nodes do not become silent zombies. Fixes #69800. Thanks @meroli28. - Gateway/plugins: stop persisted WhatsApp auth state from activating bundled channel runtime-dependency repair during startup when `channels.whatsapp` is absent, avoiding npm/git stalls on packaged Linux installs. Fixes #71994. Thanks @xiao398008. - Gateway/device tokens: enforce caller-scope containment inside token rotation and revocation so pairing-only sessions cannot mutate higher-scope operator tokens. Fixes #71990. Thanks @coygeek. - CLI/model runs: keep `openclaw infer model run` on explicit OpenRouter models from loading the full provider catalog or inheriting chat-agent silent-reply policy, restoring non-empty one-shot probe output. Fixes #68791. Thanks @limpredator. diff --git a/docs/cli/node.md b/docs/cli/node.md index 5ca50cd8937..c604f6dca8c 100644 --- a/docs/cli/node.md +++ b/docs/cli/node.md @@ -114,6 +114,12 @@ Use `openclaw node run` for a foreground node host (no service). Service commands accept `--json` for machine-readable output. +The node host retries Gateway restart and network closes in-process. If the +Gateway reports a terminal token/password/bootstrap auth pause, the node host +logs the close detail and exits non-zero so launchd/systemd can restart it with +fresh config and credentials. Pairing-required pauses stay in the foreground +flow so the pending request can be approved. + ## Pairing The first connection creates a pending device pairing request (`role: node`) on the Gateway. diff --git a/src/gateway/client.test.ts b/src/gateway/client.test.ts index 3f5594e66bb..68d459bbc0d 100644 --- a/src/gateway/client.test.ts +++ b/src/gateway/client.test.ts @@ -395,6 +395,48 @@ describe("GatewayClient close handling", () => { client.stop(); }); + it("keeps a managed reconnect timer after gateway restart closes", async () => { + vi.useFakeTimers(); + try { + const client = new GatewayClient({ + url: "ws://127.0.0.1:18789", + }); + + client.start(); + getLatestWs().emitClose(1012, "service restart"); + + expect(wsInstances).toHaveLength(1); + await vi.advanceTimersByTimeAsync(999); + expect(wsInstances).toHaveLength(1); + + await vi.advanceTimersByTimeAsync(1); + + expect(wsInstances).toHaveLength(2); + client.stop(); + } finally { + vi.useRealTimers(); + } + }); + + it("clears pending reconnect timers on stop", async () => { + vi.useFakeTimers(); + try { + const client = new GatewayClient({ + url: "ws://127.0.0.1:18789", + }); + + client.start(); + getLatestWs().emitClose(1012, "service restart"); + client.stop(); + + await vi.advanceTimersByTimeAsync(30_000); + + expect(wsInstances).toHaveLength(1); + } finally { + vi.useRealTimers(); + } + }); + it("force-terminates a lingering socket after stop", async () => { vi.useFakeTimers(); try { @@ -827,9 +869,11 @@ describe("GatewayClient connect auth payload", () => { }); it("does not auto-reconnect on AUTH_TOKEN_MISSING connect failures", async () => { + const onReconnectPaused = vi.fn(); const client = new GatewayClient({ url: "ws://127.0.0.1:18789", token: "shared-token", + onReconnectPaused, }); const { ws: ws1, connect: firstConnect } = startClientAndConnect({ client }); @@ -839,6 +883,11 @@ describe("GatewayClient connect auth payload", () => { connectId: firstConnect.id, failureDetails: { code: "AUTH_TOKEN_MISSING" }, }); + expect(onReconnectPaused).toHaveBeenCalledWith({ + code: 1008, + reason: "connect failed", + detailCode: "AUTH_TOKEN_MISSING", + }); }); it("does not auto-reconnect on token mismatch when retry is not trusted", async () => { diff --git a/src/gateway/client.ts b/src/gateway/client.ts index 8cb2bcdf5b9..d3d6ebb5c51 100644 --- a/src/gateway/client.ts +++ b/src/gateway/client.ts @@ -83,6 +83,12 @@ type FingerprintCheckingClientOptions = Omit Error | undefined; }; +export type GatewayReconnectPausedInfo = { + code: number; + reason: string; + detailCode: string | null; +}; + export class GatewayClientRequestError extends Error { readonly gatewayCode: string; readonly details?: unknown; @@ -130,6 +136,7 @@ export type GatewayClientOptions = { onEvent?: (evt: EventFrame) => void; onHelloOk?: (hello: HelloOk) => void; onConnectError?: (err: Error) => void; + onReconnectPaused?: (info: GatewayReconnectPausedInfo) => void; onClose?: (code: number, reason: string) => void; onGap?: (info: { expected: number; received: number }) => void; }; @@ -190,6 +197,7 @@ export class GatewayClient { private connectNonce: string | null = null; private connectSent = false; private connectTimer: NodeJS.Timeout | null = null; + private reconnectTimer: NodeJS.Timeout | null = null; private pendingDeviceTokenRetry = false; private deviceTokenRetryBudgetUsed = false; private pendingConnectErrorDetailCode: string | null = null; @@ -219,6 +227,7 @@ export class GatewayClient { if (this.closed) { return; } + this.clearReconnectTimer(); this.clearConnectChallengeTimeout(); this.connectNonce = null; this.connectSent = false; @@ -332,6 +341,11 @@ export class GatewayClient { } this.flushPendingErrors(new Error(`gateway closed (${code}): ${reasonText}`)); if (this.shouldPauseReconnectAfterAuthFailure(connectErrorDetailCode)) { + this.opts.onReconnectPaused?.({ + code, + reason: reasonText, + detailCode: connectErrorDetailCode, + }); this.opts.onClose?.(code, reasonText); return; } @@ -384,6 +398,7 @@ export class GatewayClient { this.pendingDeviceTokenRetry = false; this.deviceTokenRetryBudgetUsed = false; this.pendingConnectErrorDetailCode = null; + this.clearReconnectTimer(); if (this.tickTimer) { clearInterval(this.tickTimer); this.tickTimer = null; @@ -817,6 +832,13 @@ export class GatewayClient { } } + private clearReconnectTimer() { + if (this.reconnectTimer) { + clearTimeout(this.reconnectTimer); + this.reconnectTimer = null; + } + } + private armConnectChallengeTimeout() { const connectChallengeTimeoutMs = resolveGatewayClientConnectChallengeTimeoutMs(this.opts); const armedAt = Date.now(); @@ -843,9 +865,13 @@ export class GatewayClient { clearInterval(this.tickTimer); this.tickTimer = null; } + this.clearReconnectTimer(); const delay = this.backoffMs; this.backoffMs = Math.min(this.backoffMs * 2, 30_000); - setTimeout(() => this.start(), delay).unref(); + this.reconnectTimer = setTimeout(() => { + this.reconnectTimer = null; + this.start(); + }, delay); } private flushPendingErrors(err: Error) { diff --git a/src/node-host/runner.credentials.test.ts b/src/node-host/runner.credentials.test.ts index c7a04951130..20ce191500b 100644 --- a/src/node-host/runner.credentials.test.ts +++ b/src/node-host/runner.credentials.test.ts @@ -1,7 +1,12 @@ -import { describe, expect, it } from "vitest"; +import { describe, expect, it, vi } from "vitest"; import type { OpenClawConfig } from "../config/config.js"; +import { ConnectErrorDetailCodes } from "../gateway/protocol/connect-error-details.js"; import { withEnvAsync } from "../test-utils/env.js"; -import { resolveNodeHostGatewayCredentials } from "./runner.js"; +import { + handleNodeHostReconnectPaused, + resolveNodeHostGatewayCredentials, + shouldExitNodeHostOnReconnectPaused, +} from "./runner.js"; function createRemoteGatewayTokenRefConfig(tokenId: string): OpenClawConfig { return { @@ -147,3 +152,52 @@ describe("resolveNodeHostGatewayCredentials", () => { ); }); }); + +describe("handleNodeHostReconnectPaused", () => { + it("exits for terminal credential pauses so service supervisors can restart", () => { + const lines: string[] = []; + const exit = vi.fn((code: number) => { + throw new Error(`exit ${code}`); + }) as (code: number) => never; + + expect(() => + handleNodeHostReconnectPaused( + { + code: 1008, + reason: "connect failed", + detailCode: ConnectErrorDetailCodes.AUTH_TOKEN_MISMATCH, + }, + { writeLine: (line) => lines.push(line), exit }, + ), + ).toThrow("exit 1"); + + expect(exit).toHaveBeenCalledWith(1); + expect(lines).toEqual([ + "node host gateway reconnect paused after close (1008): connect failed detail=AUTH_TOKEN_MISMATCH; exiting for supervisor restart", + ]); + }); + + it("keeps pairing pauses visible without exiting foreground approval flow", () => { + const lines: string[] = []; + const exit = vi.fn((code: number) => { + throw new Error(`exit ${code}`); + }) as (code: number) => never; + + handleNodeHostReconnectPaused( + { + code: 1008, + reason: "connect failed", + detailCode: ConnectErrorDetailCodes.PAIRING_REQUIRED, + }, + { writeLine: (line) => lines.push(line), exit }, + ); + + expect(shouldExitNodeHostOnReconnectPaused(ConnectErrorDetailCodes.PAIRING_REQUIRED)).toBe( + false, + ); + expect(exit).not.toHaveBeenCalled(); + expect(lines).toEqual([ + "node host gateway reconnect paused after close (1008): connect failed detail=PAIRING_REQUIRED; waiting for operator action", + ]); + }); +}); diff --git a/src/node-host/runner.ts b/src/node-host/runner.ts index 0fe3772fefb..82ee7624f2b 100644 --- a/src/node-host/runner.ts +++ b/src/node-host/runner.ts @@ -1,7 +1,8 @@ import { loadConfig, type OpenClawConfig } from "../config/config.js"; -import { GatewayClient } from "../gateway/client.js"; +import { GatewayClient, type GatewayReconnectPausedInfo } from "../gateway/client.js"; import { resolveGatewayConnectionAuth } from "../gateway/connection-auth.js"; import { GATEWAY_CLIENT_MODES, GATEWAY_CLIENT_NAMES } from "../gateway/protocol/client-info.js"; +import { ConnectErrorDetailCodes } from "../gateway/protocol/connect-error-details.js"; import { loadOrCreateDeviceIdentity } from "../infra/device-identity.js"; import type { SkillBinTrustEntry } from "../infra/exec-approvals.js"; import { resolveExecutableFromPathEnv } from "../infra/executable-path.js"; @@ -38,6 +39,47 @@ function writeStderrLine(message: string): void { process.stderr.write(`${message}\n`); } +const NODE_HOST_EXIT_ON_RECONNECT_PAUSE_CODES: ReadonlySet = new Set([ + ConnectErrorDetailCodes.AUTH_TOKEN_MISSING, + ConnectErrorDetailCodes.AUTH_TOKEN_MISMATCH, + ConnectErrorDetailCodes.AUTH_BOOTSTRAP_TOKEN_INVALID, + ConnectErrorDetailCodes.AUTH_PASSWORD_MISSING, + ConnectErrorDetailCodes.AUTH_PASSWORD_MISMATCH, +]); + +type NodeHostReconnectPausedDeps = { + writeLine?: (message: string) => void; + exit?: (code: number) => never; +}; + +export function shouldExitNodeHostOnReconnectPaused(detailCode: string | null): boolean { + return detailCode !== null && NODE_HOST_EXIT_ON_RECONNECT_PAUSE_CODES.has(detailCode); +} + +export function formatNodeHostReconnectPausedMessage( + info: GatewayReconnectPausedInfo, + params?: { exiting?: boolean }, +): string { + const detail = info.detailCode ? ` detail=${info.detailCode}` : ""; + const reason = info.reason.trim() || "no close reason"; + const action = params?.exiting ? "exiting for supervisor restart" : "waiting for operator action"; + return `node host gateway reconnect paused after close (${info.code}): ${reason}${detail}; ${action}`; +} + +export function handleNodeHostReconnectPaused( + info: GatewayReconnectPausedInfo, + deps: NodeHostReconnectPausedDeps = {}, +): void { + const shouldExit = shouldExitNodeHostOnReconnectPaused(info.detailCode); + const writeLine = deps.writeLine ?? writeStderrLine; + writeLine(formatNodeHostReconnectPausedMessage(info, { exiting: shouldExit })); + if (!shouldExit) { + return; + } + const exit = deps.exit ?? ((code: number): never => process.exit(code)); + exit(1); +} + function resolveExecutablePathFromEnv(bin: string, pathEnv: string): string | null { if (bin.includes("/") || bin.includes("\\")) { return null; @@ -212,6 +254,9 @@ export async function runNodeHost(opts: NodeHostRunOptions): Promise { // keep retrying (handled by GatewayClient) writeStderrLine(`node host gateway connect failed: ${err.message}`); }, + onReconnectPaused: (info) => { + handleNodeHostReconnectPaused(info); + }, onClose: (code, reason) => { writeStderrLine(`node host gateway closed (${code}): ${reason}`); },