mirror of
https://github.com/openclaw/openclaw.git
synced 2026-05-06 06:20:43 +00:00
fix(node-runtime): keep node-host recovering after gateway restarts
This commit is contained in:
@@ -78,6 +78,7 @@ Docs: https://docs.openclaw.ai
|
||||
|
||||
- Gateway/install: refresh loaded gateway service installs when the current service embeds stale gateway auth instead of returning already-installed, avoiding LaunchAgent token-mismatch loops after token rotation. Fixes #70752. Thanks @hyspacex.
|
||||
- Update: ignore bundled plugin `.openclaw-install-stage` directories during global install verification and packaged dist pruning so leftover runtime-dep staging files do not turn successful updates into `unexpected packaged dist file` failures. Fixes #71752. Thanks @waynegault.
|
||||
- Node runtime: keep node-host retry timers alive across Gateway restarts and exit on terminal credential pauses so supervised nodes do not become silent zombies. Fixes #69800. Thanks @meroli28.
|
||||
- Gateway/plugins: stop persisted WhatsApp auth state from activating bundled channel runtime-dependency repair during startup when `channels.whatsapp` is absent, avoiding npm/git stalls on packaged Linux installs. Fixes #71994. Thanks @xiao398008.
|
||||
- Gateway/device tokens: enforce caller-scope containment inside token rotation and revocation so pairing-only sessions cannot mutate higher-scope operator tokens. Fixes #71990. Thanks @coygeek.
|
||||
- CLI/model runs: keep `openclaw infer model run` on explicit OpenRouter models from loading the full provider catalog or inheriting chat-agent silent-reply policy, restoring non-empty one-shot probe output. Fixes #68791. Thanks @limpredator.
|
||||
|
||||
@@ -114,6 +114,12 @@ Use `openclaw node run` for a foreground node host (no service).
|
||||
|
||||
Service commands accept `--json` for machine-readable output.
|
||||
|
||||
The node host reconnects in-process after Gateway restarts and network closes. If the
|
||||
Gateway reports a terminal token/password/bootstrap auth pause, the node host
|
||||
logs the close detail and exits non-zero so launchd/systemd can restart it with
|
||||
fresh config and credentials. Pairing-required pauses stay in the foreground
|
||||
flow so the pending request can be approved.
|
||||
|
||||
## Pairing
|
||||
|
||||
The first connection creates a pending device pairing request (`role: node`) on the Gateway.
|
||||
|
||||
@@ -395,6 +395,48 @@ describe("GatewayClient close handling", () => {
|
||||
client.stop();
|
||||
});
|
||||
|
||||
it("keeps a managed reconnect timer after gateway restart closes", async () => {
  vi.useFakeTimers();
  try {
    const gatewayClient = new GatewayClient({ url: "ws://127.0.0.1:18789" });
    gatewayClient.start();

    // Simulate the gateway closing the socket with a 1012 "service restart".
    getLatestWs().emitClose(1012, "service restart");
    expect(wsInstances).toHaveLength(1);

    // One millisecond short of the 1000ms mark: still only the original socket.
    await vi.advanceTimersByTimeAsync(999);
    expect(wsInstances).toHaveLength(1);

    // Crossing the 1000ms mark fires the reconnect timer and opens a second socket.
    await vi.advanceTimersByTimeAsync(1);
    expect(wsInstances).toHaveLength(2);

    gatewayClient.stop();
  } finally {
    // Always restore real timers so later tests are not affected.
    vi.useRealTimers();
  }
});
|
||||
|
||||
it("clears pending reconnect timers on stop", async () => {
  vi.useFakeTimers();
  try {
    const gatewayClient = new GatewayClient({ url: "ws://127.0.0.1:18789" });
    gatewayClient.start();

    // Close with a restart code, then stop before any fake time elapses.
    getLatestWs().emitClose(1012, "service restart");
    gatewayClient.stop();

    // Even after 30s of fake time, no second socket may have been created.
    await vi.advanceTimersByTimeAsync(30_000);
    expect(wsInstances).toHaveLength(1);
  } finally {
    // Always restore real timers so later tests are not affected.
    vi.useRealTimers();
  }
});
|
||||
|
||||
it("force-terminates a lingering socket after stop", async () => {
|
||||
vi.useFakeTimers();
|
||||
try {
|
||||
@@ -827,9 +869,11 @@ describe("GatewayClient connect auth payload", () => {
|
||||
});
|
||||
|
||||
it("does not auto-reconnect on AUTH_TOKEN_MISSING connect failures", async () => {
|
||||
const onReconnectPaused = vi.fn();
|
||||
const client = new GatewayClient({
|
||||
url: "ws://127.0.0.1:18789",
|
||||
token: "shared-token",
|
||||
onReconnectPaused,
|
||||
});
|
||||
|
||||
const { ws: ws1, connect: firstConnect } = startClientAndConnect({ client });
|
||||
@@ -839,6 +883,11 @@ describe("GatewayClient connect auth payload", () => {
|
||||
connectId: firstConnect.id,
|
||||
failureDetails: { code: "AUTH_TOKEN_MISSING" },
|
||||
});
|
||||
expect(onReconnectPaused).toHaveBeenCalledWith({
|
||||
code: 1008,
|
||||
reason: "connect failed",
|
||||
detailCode: "AUTH_TOKEN_MISSING",
|
||||
});
|
||||
});
|
||||
|
||||
it("does not auto-reconnect on token mismatch when retry is not trusted", async () => {
|
||||
|
||||
@@ -83,6 +83,12 @@ type FingerprintCheckingClientOptions = Omit<ClientOptions, "checkServerIdentity
|
||||
checkServerIdentity?: (servername: string, cert: CertMeta) => Error | undefined;
|
||||
};
|
||||
|
||||
/**
 * Payload handed to `GatewayClientOptions.onReconnectPaused` when the client
 * decides not to schedule another reconnect after a close.
 */
export type GatewayReconnectPausedInfo = {
  /** Close code reported for the connection that ended. */
  code: number;
  /** Close reason text reported alongside the code. */
  reason: string;
  /** Connect-error detail code that triggered the pause, or `null` when none was recorded. */
  detailCode: string | null;
};
|
||||
|
||||
export class GatewayClientRequestError extends Error {
|
||||
readonly gatewayCode: string;
|
||||
readonly details?: unknown;
|
||||
@@ -130,6 +136,7 @@ export type GatewayClientOptions = {
|
||||
onEvent?: (evt: EventFrame) => void;
|
||||
onHelloOk?: (hello: HelloOk) => void;
|
||||
onConnectError?: (err: Error) => void;
|
||||
onReconnectPaused?: (info: GatewayReconnectPausedInfo) => void;
|
||||
onClose?: (code: number, reason: string) => void;
|
||||
onGap?: (info: { expected: number; received: number }) => void;
|
||||
};
|
||||
@@ -190,6 +197,7 @@ export class GatewayClient {
|
||||
private connectNonce: string | null = null;
|
||||
private connectSent = false;
|
||||
private connectTimer: NodeJS.Timeout | null = null;
|
||||
private reconnectTimer: NodeJS.Timeout | null = null;
|
||||
private pendingDeviceTokenRetry = false;
|
||||
private deviceTokenRetryBudgetUsed = false;
|
||||
private pendingConnectErrorDetailCode: string | null = null;
|
||||
@@ -219,6 +227,7 @@ export class GatewayClient {
|
||||
if (this.closed) {
|
||||
return;
|
||||
}
|
||||
this.clearReconnectTimer();
|
||||
this.clearConnectChallengeTimeout();
|
||||
this.connectNonce = null;
|
||||
this.connectSent = false;
|
||||
@@ -332,6 +341,11 @@ export class GatewayClient {
|
||||
}
|
||||
this.flushPendingErrors(new Error(`gateway closed (${code}): ${reasonText}`));
|
||||
if (this.shouldPauseReconnectAfterAuthFailure(connectErrorDetailCode)) {
|
||||
this.opts.onReconnectPaused?.({
|
||||
code,
|
||||
reason: reasonText,
|
||||
detailCode: connectErrorDetailCode,
|
||||
});
|
||||
this.opts.onClose?.(code, reasonText);
|
||||
return;
|
||||
}
|
||||
@@ -384,6 +398,7 @@ export class GatewayClient {
|
||||
this.pendingDeviceTokenRetry = false;
|
||||
this.deviceTokenRetryBudgetUsed = false;
|
||||
this.pendingConnectErrorDetailCode = null;
|
||||
this.clearReconnectTimer();
|
||||
if (this.tickTimer) {
|
||||
clearInterval(this.tickTimer);
|
||||
this.tickTimer = null;
|
||||
@@ -817,6 +832,13 @@ export class GatewayClient {
|
||||
}
|
||||
}
|
||||
|
||||
/** Cancels the pending reconnect timer, if one is armed, and forgets its handle. */
private clearReconnectTimer() {
  const pending = this.reconnectTimer;
  if (!pending) {
    return;
  }
  clearTimeout(pending);
  this.reconnectTimer = null;
}
|
||||
|
||||
private armConnectChallengeTimeout() {
|
||||
const connectChallengeTimeoutMs = resolveGatewayClientConnectChallengeTimeoutMs(this.opts);
|
||||
const armedAt = Date.now();
|
||||
@@ -843,9 +865,13 @@ export class GatewayClient {
|
||||
clearInterval(this.tickTimer);
|
||||
this.tickTimer = null;
|
||||
}
|
||||
this.clearReconnectTimer();
|
||||
const delay = this.backoffMs;
|
||||
this.backoffMs = Math.min(this.backoffMs * 2, 30_000);
|
||||
setTimeout(() => this.start(), delay).unref();
|
||||
this.reconnectTimer = setTimeout(() => {
|
||||
this.reconnectTimer = null;
|
||||
this.start();
|
||||
}, delay);
|
||||
}
|
||||
|
||||
private flushPendingErrors(err: Error) {
|
||||
|
||||
@@ -1,7 +1,12 @@
|
||||
import { describe, expect, it } from "vitest";
|
||||
import { describe, expect, it, vi } from "vitest";
|
||||
import type { OpenClawConfig } from "../config/config.js";
|
||||
import { ConnectErrorDetailCodes } from "../gateway/protocol/connect-error-details.js";
|
||||
import { withEnvAsync } from "../test-utils/env.js";
|
||||
import { resolveNodeHostGatewayCredentials } from "./runner.js";
|
||||
import {
|
||||
handleNodeHostReconnectPaused,
|
||||
resolveNodeHostGatewayCredentials,
|
||||
shouldExitNodeHostOnReconnectPaused,
|
||||
} from "./runner.js";
|
||||
|
||||
function createRemoteGatewayTokenRefConfig(tokenId: string): OpenClawConfig {
|
||||
return {
|
||||
@@ -147,3 +152,52 @@ describe("resolveNodeHostGatewayCredentials", () => {
|
||||
);
|
||||
});
|
||||
});
|
||||
|
||||
describe("handleNodeHostReconnectPaused", () => {
  /**
   * Builds the injected dependencies shared by both cases: a line collector
   * and an exit stub that throws instead of terminating the test process.
   */
  function createPauseHarness() {
    const lines: string[] = [];
    const exit = vi.fn((code: number) => {
      throw new Error(`exit ${code}`);
    }) as (code: number) => never;
    const deps = { writeLine: (line: string) => lines.push(line), exit };
    return { lines, exit, deps };
  }

  it("exits for terminal credential pauses so service supervisors can restart", () => {
    const { lines, exit, deps } = createPauseHarness();
    const pauseInfo = {
      code: 1008,
      reason: "connect failed",
      detailCode: ConnectErrorDetailCodes.AUTH_TOKEN_MISMATCH,
    };

    // The stubbed exit throws, so reaching it surfaces as an "exit 1" error.
    expect(() => handleNodeHostReconnectPaused(pauseInfo, deps)).toThrow("exit 1");

    expect(exit).toHaveBeenCalledWith(1);
    expect(lines).toEqual([
      "node host gateway reconnect paused after close (1008): connect failed detail=AUTH_TOKEN_MISMATCH; exiting for supervisor restart",
    ]);
  });

  it("keeps pairing pauses visible without exiting foreground approval flow", () => {
    const { lines, exit, deps } = createPauseHarness();
    const pauseInfo = {
      code: 1008,
      reason: "connect failed",
      detailCode: ConnectErrorDetailCodes.PAIRING_REQUIRED,
    };

    handleNodeHostReconnectPaused(pauseInfo, deps);

    expect(shouldExitNodeHostOnReconnectPaused(ConnectErrorDetailCodes.PAIRING_REQUIRED)).toBe(
      false,
    );
    expect(exit).not.toHaveBeenCalled();
    expect(lines).toEqual([
      "node host gateway reconnect paused after close (1008): connect failed detail=PAIRING_REQUIRED; waiting for operator action",
    ]);
  });
});
|
||||
|
||||
@@ -1,7 +1,8 @@
|
||||
import { loadConfig, type OpenClawConfig } from "../config/config.js";
|
||||
import { GatewayClient } from "../gateway/client.js";
|
||||
import { GatewayClient, type GatewayReconnectPausedInfo } from "../gateway/client.js";
|
||||
import { resolveGatewayConnectionAuth } from "../gateway/connection-auth.js";
|
||||
import { GATEWAY_CLIENT_MODES, GATEWAY_CLIENT_NAMES } from "../gateway/protocol/client-info.js";
|
||||
import { ConnectErrorDetailCodes } from "../gateway/protocol/connect-error-details.js";
|
||||
import { loadOrCreateDeviceIdentity } from "../infra/device-identity.js";
|
||||
import type { SkillBinTrustEntry } from "../infra/exec-approvals.js";
|
||||
import { resolveExecutableFromPathEnv } from "../infra/executable-path.js";
|
||||
@@ -38,6 +39,47 @@ function writeStderrLine(message: string): void {
|
||||
process.stderr.write(`${message}\n`);
|
||||
}
|
||||
|
||||
/**
 * Connect-error detail codes for which a paused reconnect makes the node host
 * exit rather than keep waiting (token, bootstrap-token, and password failures).
 */
const NODE_HOST_EXIT_ON_RECONNECT_PAUSE_CODES: ReadonlySet<string> = new Set<string>([
  ConnectErrorDetailCodes.AUTH_TOKEN_MISSING,
  ConnectErrorDetailCodes.AUTH_TOKEN_MISMATCH,
  ConnectErrorDetailCodes.AUTH_BOOTSTRAP_TOKEN_INVALID,
  ConnectErrorDetailCodes.AUTH_PASSWORD_MISSING,
  ConnectErrorDetailCodes.AUTH_PASSWORD_MISMATCH,
]);
|
||||
|
||||
/** Optional overrides for the side effects of `handleNodeHostReconnectPaused`. */
type NodeHostReconnectPausedDeps = {
  /** Receives the formatted pause message; when omitted the message goes to stderr. */
  writeLine?: (message: string) => void;
  /** Terminates the process with the given code; when omitted `process.exit` is used. */
  exit?: (code: number) => never;
};
|
||||
|
||||
/**
 * Reports whether a paused reconnect with the given detail code should make
 * the node host exit.
 *
 * @param detailCode Connect-error detail code, or `null` when none was recorded.
 * @returns `true` only when the code is in `NODE_HOST_EXIT_ON_RECONNECT_PAUSE_CODES`.
 */
export function shouldExitNodeHostOnReconnectPaused(detailCode: string | null): boolean {
  if (detailCode === null) {
    return false;
  }
  return NODE_HOST_EXIT_ON_RECONNECT_PAUSE_CODES.has(detailCode);
}
|
||||
|
||||
/**
 * Builds the single log line describing a paused gateway reconnect.
 *
 * @param info Close code, close reason, and optional detail code of the pause.
 * @param params Set `exiting` when the host is about to exit; otherwise the
 *   message says it is waiting for operator action.
 * @returns The formatted message, without a trailing newline.
 */
export function formatNodeHostReconnectPausedMessage(
  info: GatewayReconnectPausedInfo,
  params?: { exiting?: boolean },
): string {
  // A blank or whitespace-only reason is replaced by a fixed placeholder.
  const trimmedReason = info.reason.trim();
  const closeReason = trimmedReason ? trimmedReason : "no close reason";

  // The detail suffix is only present when a non-empty detail code exists.
  let detailSuffix = "";
  if (info.detailCode) {
    detailSuffix = ` detail=${info.detailCode}`;
  }

  let nextStep = "waiting for operator action";
  if (params?.exiting) {
    nextStep = "exiting for supervisor restart";
  }

  return `node host gateway reconnect paused after close (${info.code}): ${closeReason}${detailSuffix}; ${nextStep}`;
}
|
||||
|
||||
/**
 * Logs a paused gateway reconnect and, for detail codes that
 * `shouldExitNodeHostOnReconnectPaused` accepts, exits with status 1.
 *
 * @param info Close code, close reason, and optional detail code of the pause.
 * @param deps Optional `writeLine` / `exit` overrides; defaults write to stderr
 *   and call `process.exit`.
 */
export function handleNodeHostReconnectPaused(
  info: GatewayReconnectPausedInfo,
  deps: NodeHostReconnectPausedDeps = {},
): void {
  const exiting = shouldExitNodeHostOnReconnectPaused(info.detailCode);

  // The message is always written, whether or not the host goes on to exit.
  const log = deps.writeLine ?? writeStderrLine;
  log(formatNodeHostReconnectPausedMessage(info, { exiting }));

  if (exiting) {
    const terminate = deps.exit ?? ((code: number): never => process.exit(code));
    terminate(1);
  }
}
|
||||
|
||||
function resolveExecutablePathFromEnv(bin: string, pathEnv: string): string | null {
|
||||
if (bin.includes("/") || bin.includes("\\")) {
|
||||
return null;
|
||||
@@ -212,6 +254,9 @@ export async function runNodeHost(opts: NodeHostRunOptions): Promise<void> {
|
||||
// keep retrying (handled by GatewayClient)
|
||||
writeStderrLine(`node host gateway connect failed: ${err.message}`);
|
||||
},
|
||||
onReconnectPaused: (info) => {
|
||||
handleNodeHostReconnectPaused(info);
|
||||
},
|
||||
onClose: (code, reason) => {
|
||||
writeStderrLine(`node host gateway closed (${code}): ${reason}`);
|
||||
},
|
||||
|
||||
Reference in New Issue
Block a user