fix(node-runtime): keep node-host recovering after gateway restarts

This commit is contained in:
Peter Steinberger
2026-04-26 07:49:08 +01:00
parent c7ead7d8a9
commit 4ee537a04a
6 changed files with 185 additions and 4 deletions

View File

@@ -78,6 +78,7 @@ Docs: https://docs.openclaw.ai
- Gateway/install: refresh loaded gateway service installs when the current service embeds stale gateway auth instead of returning already-installed, avoiding LaunchAgent token-mismatch loops after token rotation. Fixes #70752. Thanks @hyspacex.
- Update: ignore bundled plugin `.openclaw-install-stage` directories during global install verification and packaged dist pruning so leftover runtime-dep staging files do not turn successful updates into `unexpected packaged dist file` failures. Fixes #71752. Thanks @waynegault.
- Node runtime: keep node-host retry timers alive across Gateway restarts and exit on terminal credential pauses so supervised nodes do not become silent zombies. Fixes #69800. Thanks @meroli28.
- Gateway/plugins: stop persisted WhatsApp auth state from activating bundled channel runtime-dependency repair during startup when `channels.whatsapp` is absent, avoiding npm/git stalls on packaged Linux installs. Fixes #71994. Thanks @xiao398008.
- Gateway/device tokens: enforce caller-scope containment inside token rotation and revocation so pairing-only sessions cannot mutate higher-scope operator tokens. Fixes #71990. Thanks @coygeek.
- CLI/model runs: keep `openclaw infer model run` on explicit OpenRouter models from loading the full provider catalog or inheriting chat-agent silent-reply policy, restoring non-empty one-shot probe output. Fixes #68791. Thanks @limpredator.

View File

@@ -114,6 +114,12 @@ Use `openclaw node run` for a foreground node host (no service).
Service commands accept `--json` for machine-readable output.
The node host reconnects in-process after Gateway restarts and network closes. If the
Gateway reports a terminal token/password/bootstrap auth pause, the node host
logs the close detail and exits non-zero so launchd/systemd can restart it with
fresh config and credentials. Pairing-required pauses stay in the foreground
flow so the pending request can be approved.
## Pairing
The first connection creates a pending device pairing request (`role: node`) on the Gateway.

View File

@@ -395,6 +395,48 @@ describe("GatewayClient close handling", () => {
client.stop();
});
// Verifies that a close with code 1012 ("service restart") schedules a reconnect
// that opens a second socket once the backoff delay has elapsed.
it("keeps a managed reconnect timer after gateway restart closes", async () => {
vi.useFakeTimers();
try {
const client = new GatewayClient({
url: "ws://127.0.0.1:18789",
});
client.start();
// Simulate the gateway closing the socket because it is restarting.
getLatestWs().emitClose(1012, "service restart");
// No reconnect may happen synchronously with the close.
expect(wsInstances).toHaveLength(1);
// One millisecond short of the expected delay: still only the first socket.
await vi.advanceTimersByTimeAsync(999);
expect(wsInstances).toHaveLength(1);
// At 1000 ms in total the reconnect is expected to have created a second socket.
await vi.advanceTimersByTimeAsync(1);
expect(wsInstances).toHaveLength(2);
client.stop();
} finally {
// Always restore real timers so later tests are unaffected.
vi.useRealTimers();
}
});
// Verifies that stop() cancels a reconnect that was scheduled by a prior close,
// so no new socket is ever created afterwards.
it("clears pending reconnect timers on stop", async () => {
vi.useFakeTimers();
try {
const client = new GatewayClient({
url: "ws://127.0.0.1:18789",
});
client.start();
// The close schedules a reconnect; stop() right after must cancel it.
getLatestWs().emitClose(1012, "service restart");
client.stop();
// Advance well past any pending delay; the socket count must not grow.
await vi.advanceTimersByTimeAsync(30_000);
expect(wsInstances).toHaveLength(1);
} finally {
// Always restore real timers so later tests are unaffected.
vi.useRealTimers();
}
});
it("force-terminates a lingering socket after stop", async () => {
vi.useFakeTimers();
try {
@@ -827,9 +869,11 @@ describe("GatewayClient connect auth payload", () => {
});
it("does not auto-reconnect on AUTH_TOKEN_MISSING connect failures", async () => {
const onReconnectPaused = vi.fn();
const client = new GatewayClient({
url: "ws://127.0.0.1:18789",
token: "shared-token",
onReconnectPaused,
});
const { ws: ws1, connect: firstConnect } = startClientAndConnect({ client });
@@ -839,6 +883,11 @@ describe("GatewayClient connect auth payload", () => {
connectId: firstConnect.id,
failureDetails: { code: "AUTH_TOKEN_MISSING" },
});
expect(onReconnectPaused).toHaveBeenCalledWith({
code: 1008,
reason: "connect failed",
detailCode: "AUTH_TOKEN_MISSING",
});
});
it("does not auto-reconnect on token mismatch when retry is not trusted", async () => {

View File

@@ -83,6 +83,12 @@ type FingerprintCheckingClientOptions = Omit<ClientOptions, "checkServerIdentity
checkServerIdentity?: (servername: string, cert: CertMeta) => Error | undefined;
};
/**
 * Payload passed to `onReconnectPaused` when the client stops auto-reconnecting
 * after an auth-related connect failure.
 */
export type GatewayReconnectPausedInfo = {
/** Close code of the connection that triggered the pause. */
code: number;
/** Close reason text reported for that close. */
reason: string;
/** Connect-error detail code recorded for the failed connect, or `null` if none. */
detailCode: string | null;
};
export class GatewayClientRequestError extends Error {
readonly gatewayCode: string;
readonly details?: unknown;
@@ -130,6 +136,7 @@ export type GatewayClientOptions = {
onEvent?: (evt: EventFrame) => void;
onHelloOk?: (hello: HelloOk) => void;
onConnectError?: (err: Error) => void;
onReconnectPaused?: (info: GatewayReconnectPausedInfo) => void;
onClose?: (code: number, reason: string) => void;
onGap?: (info: { expected: number; received: number }) => void;
};
@@ -190,6 +197,7 @@ export class GatewayClient {
private connectNonce: string | null = null;
private connectSent = false;
private connectTimer: NodeJS.Timeout | null = null;
private reconnectTimer: NodeJS.Timeout | null = null;
private pendingDeviceTokenRetry = false;
private deviceTokenRetryBudgetUsed = false;
private pendingConnectErrorDetailCode: string | null = null;
@@ -219,6 +227,7 @@ export class GatewayClient {
if (this.closed) {
return;
}
this.clearReconnectTimer();
this.clearConnectChallengeTimeout();
this.connectNonce = null;
this.connectSent = false;
@@ -332,6 +341,11 @@ export class GatewayClient {
}
this.flushPendingErrors(new Error(`gateway closed (${code}): ${reasonText}`));
if (this.shouldPauseReconnectAfterAuthFailure(connectErrorDetailCode)) {
this.opts.onReconnectPaused?.({
code,
reason: reasonText,
detailCode: connectErrorDetailCode,
});
this.opts.onClose?.(code, reasonText);
return;
}
@@ -384,6 +398,7 @@ export class GatewayClient {
this.pendingDeviceTokenRetry = false;
this.deviceTokenRetryBudgetUsed = false;
this.pendingConnectErrorDetailCode = null;
this.clearReconnectTimer();
if (this.tickTimer) {
clearInterval(this.tickTimer);
this.tickTimer = null;
@@ -817,6 +832,13 @@ export class GatewayClient {
}
}
/** Cancels the pending reconnect timer, if one is armed, and resets the handle. */
private clearReconnectTimer() {
  const pending = this.reconnectTimer;
  if (!pending) {
    return;
  }
  clearTimeout(pending);
  this.reconnectTimer = null;
}
private armConnectChallengeTimeout() {
const connectChallengeTimeoutMs = resolveGatewayClientConnectChallengeTimeoutMs(this.opts);
const armedAt = Date.now();
@@ -843,9 +865,13 @@ export class GatewayClient {
clearInterval(this.tickTimer);
this.tickTimer = null;
}
this.clearReconnectTimer();
const delay = this.backoffMs;
this.backoffMs = Math.min(this.backoffMs * 2, 30_000);
setTimeout(() => this.start(), delay).unref();
this.reconnectTimer = setTimeout(() => {
this.reconnectTimer = null;
this.start();
}, delay);
}
private flushPendingErrors(err: Error) {

View File

@@ -1,7 +1,12 @@
import { describe, expect, it } from "vitest";
import { describe, expect, it, vi } from "vitest";
import type { OpenClawConfig } from "../config/config.js";
import { ConnectErrorDetailCodes } from "../gateway/protocol/connect-error-details.js";
import { withEnvAsync } from "../test-utils/env.js";
import { resolveNodeHostGatewayCredentials } from "./runner.js";
import {
handleNodeHostReconnectPaused,
resolveNodeHostGatewayCredentials,
shouldExitNodeHostOnReconnectPaused,
} from "./runner.js";
function createRemoteGatewayTokenRefConfig(tokenId: string): OpenClawConfig {
return {
@@ -147,3 +152,52 @@ describe("resolveNodeHostGatewayCredentials", () => {
);
});
});
// Covers both outcomes of handleNodeHostReconnectPaused: exiting on a credential
// failure and staying alive on a pairing-required pause.
describe("handleNodeHostReconnectPaused", () => {
it("exits for terminal credential pauses so service supervisors can restart", () => {
const lines: string[] = [];
// The stub throws so control flow stops at the exit call, like a real `never` exit.
const exit = vi.fn((code: number) => {
throw new Error(`exit ${code}`);
}) as (code: number) => never;
expect(() =>
handleNodeHostReconnectPaused(
{
code: 1008,
reason: "connect failed",
detailCode: ConnectErrorDetailCodes.AUTH_TOKEN_MISMATCH,
},
{ writeLine: (line) => lines.push(line), exit },
),
).toThrow("exit 1");
expect(exit).toHaveBeenCalledWith(1);
// The message must be written before exiting and must name the exit action.
expect(lines).toEqual([
"node host gateway reconnect paused after close (1008): connect failed detail=AUTH_TOKEN_MISMATCH; exiting for supervisor restart",
]);
});
it("keeps pairing pauses visible without exiting foreground approval flow", () => {
const lines: string[] = [];
// Same throwing stub; here it must never be invoked.
const exit = vi.fn((code: number) => {
throw new Error(`exit ${code}`);
}) as (code: number) => never;
handleNodeHostReconnectPaused(
{
code: 1008,
reason: "connect failed",
detailCode: ConnectErrorDetailCodes.PAIRING_REQUIRED,
},
{ writeLine: (line) => lines.push(line), exit },
);
expect(shouldExitNodeHostOnReconnectPaused(ConnectErrorDetailCodes.PAIRING_REQUIRED)).toBe(
false,
);
expect(exit).not.toHaveBeenCalled();
// The pause is still logged, with the "waiting" action instead of the exit action.
expect(lines).toEqual([
"node host gateway reconnect paused after close (1008): connect failed detail=PAIRING_REQUIRED; waiting for operator action",
]);
});
});

View File

@@ -1,7 +1,8 @@
import { loadConfig, type OpenClawConfig } from "../config/config.js";
import { GatewayClient } from "../gateway/client.js";
import { GatewayClient, type GatewayReconnectPausedInfo } from "../gateway/client.js";
import { resolveGatewayConnectionAuth } from "../gateway/connection-auth.js";
import { GATEWAY_CLIENT_MODES, GATEWAY_CLIENT_NAMES } from "../gateway/protocol/client-info.js";
import { ConnectErrorDetailCodes } from "../gateway/protocol/connect-error-details.js";
import { loadOrCreateDeviceIdentity } from "../infra/device-identity.js";
import type { SkillBinTrustEntry } from "../infra/exec-approvals.js";
import { resolveExecutableFromPathEnv } from "../infra/executable-path.js";
@@ -38,6 +39,47 @@ function writeStderrLine(message: string): void {
process.stderr.write(`${message}\n`);
}
/**
 * Connect-error detail codes for which the node host exits (instead of waiting)
 * when the gateway client pauses reconnecting: token, bootstrap-token and
 * password failures.
 */
const NODE_HOST_EXIT_ON_RECONNECT_PAUSE_CODES: ReadonlySet<string> = new Set([
ConnectErrorDetailCodes.AUTH_TOKEN_MISSING,
ConnectErrorDetailCodes.AUTH_TOKEN_MISMATCH,
ConnectErrorDetailCodes.AUTH_BOOTSTRAP_TOKEN_INVALID,
ConnectErrorDetailCodes.AUTH_PASSWORD_MISSING,
ConnectErrorDetailCodes.AUTH_PASSWORD_MISMATCH,
]);
/** Optional overrides for the side effects of `handleNodeHostReconnectPaused`. */
type NodeHostReconnectPausedDeps = {
/** Sink for the log line; when omitted the handler writes to stderr. */
writeLine?: (message: string) => void;
/** Process terminator; when omitted the handler calls `process.exit`. */
exit?: (code: number) => never;
};
/**
 * Reports whether a reconnect pause with the given detail code should
 * terminate the node host.
 *
 * @param detailCode - Connect-error detail code from the pause, or `null`.
 * @returns `true` only when the code is in the exit-on-pause set.
 */
export function shouldExitNodeHostOnReconnectPaused(detailCode: string | null): boolean {
  if (detailCode === null) {
    return false;
  }
  return NODE_HOST_EXIT_ON_RECONNECT_PAUSE_CODES.has(detailCode);
}
/**
 * Builds the single log line describing a paused gateway reconnect.
 *
 * @param info - Close code, close reason and optional detail code of the pause.
 * @param params - `exiting: true` selects the "exiting" action text; otherwise
 *   the "waiting" action text is used.
 * @returns The formatted message.
 */
export function formatNodeHostReconnectPausedMessage(
  info: GatewayReconnectPausedInfo,
  params?: { exiting?: boolean },
): string {
  // A blank or whitespace-only close reason falls back to a placeholder.
  const trimmedReason = info.reason.trim();
  const reasonText = trimmedReason || "no close reason";

  // The detail suffix is only present when a non-empty detail code exists.
  let detailSuffix = "";
  if (info.detailCode) {
    detailSuffix = ` detail=${info.detailCode}`;
  }

  let nextStep = "waiting for operator action";
  if (params?.exiting) {
    nextStep = "exiting for supervisor restart";
  }

  return `node host gateway reconnect paused after close (${info.code}): ${reasonText}${detailSuffix}; ${nextStep}`;
}
/**
 * Logs a paused gateway reconnect and, for detail codes in the exit-on-pause
 * set, terminates the process with exit code 1.
 *
 * @param info - Details of the pause reported by the gateway client.
 * @param deps - Optional overrides for the log sink and the exit function.
 */
export function handleNodeHostReconnectPaused(
  info: GatewayReconnectPausedInfo,
  deps: NodeHostReconnectPausedDeps = {},
): void {
  const exiting = shouldExitNodeHostOnReconnectPaused(info.detailCode);
  const emit = deps.writeLine ?? writeStderrLine;

  // Always log first so the reason is visible even when the process exits next.
  emit(formatNodeHostReconnectPausedMessage(info, { exiting }));

  if (exiting) {
    const terminate = deps.exit ?? ((code: number): never => process.exit(code));
    terminate(1);
  }
}
function resolveExecutablePathFromEnv(bin: string, pathEnv: string): string | null {
if (bin.includes("/") || bin.includes("\\")) {
return null;
@@ -212,6 +254,9 @@ export async function runNodeHost(opts: NodeHostRunOptions): Promise<void> {
// keep retrying (handled by GatewayClient)
writeStderrLine(`node host gateway connect failed: ${err.message}`);
},
onReconnectPaused: (info) => {
handleNodeHostReconnectPaused(info);
},
onClose: (code, reason) => {
writeStderrLine(`node host gateway closed (${code}): ${reason}`);
},