fix: harden windows gateway lifecycle

This commit is contained in:
Peter Steinberger
2026-03-13 18:33:59 +00:00
parent 84a2a289e6
commit 5ea03efe92
11 changed files with 680 additions and 171 deletions

View File

@@ -1,8 +1,5 @@
import { afterEach, beforeAll, beforeEach, describe, expect, it, vi } from "vitest";
const mockReadFileSync = vi.hoisted(() => vi.fn());
const mockSpawnSync = vi.hoisted(() => vi.fn());
type RestartHealthSnapshot = {
healthy: boolean;
staleGatewayPids: number[];
@@ -35,7 +32,9 @@ const terminateStaleGatewayPids = vi.fn();
const renderGatewayPortHealthDiagnostics = vi.fn(() => ["diag: unhealthy port"]);
const renderRestartDiagnostics = vi.fn(() => ["diag: unhealthy runtime"]);
const resolveGatewayPort = vi.fn(() => 18789);
const findGatewayPidsOnPortSync = vi.fn<(port: number) => number[]>(() => []);
const findVerifiedGatewayListenerPidsOnPortSync = vi.fn<(port: number) => number[]>(() => []);
const signalVerifiedGatewayPidSync = vi.fn<(pid: number, signal: "SIGTERM" | "SIGUSR1") => void>();
const formatGatewayPidList = vi.fn<(pids: number[]) => string>((pids) => pids.join(", "));
const probeGateway = vi.fn<
(opts: {
url: string;
@@ -49,24 +48,18 @@ const probeGateway = vi.fn<
const isRestartEnabled = vi.fn<(config?: { commands?: unknown }) => boolean>(() => true);
const loadConfig = vi.fn(() => ({}));
vi.mock("node:fs", () => ({
default: {
readFileSync: (...args: unknown[]) => mockReadFileSync(...args),
},
}));
vi.mock("node:child_process", () => ({
spawnSync: (...args: unknown[]) => mockSpawnSync(...args),
}));
vi.mock("../../config/config.js", () => ({
loadConfig: () => loadConfig(),
readBestEffortConfig: async () => loadConfig(),
resolveGatewayPort,
}));
vi.mock("../../infra/restart.js", () => ({
findGatewayPidsOnPortSync: (port: number) => findGatewayPidsOnPortSync(port),
vi.mock("../../infra/gateway-processes.js", () => ({
findVerifiedGatewayListenerPidsOnPortSync: (port: number) =>
findVerifiedGatewayListenerPidsOnPortSync(port),
signalVerifiedGatewayPidSync: (pid: number, signal: "SIGTERM" | "SIGUSR1") =>
signalVerifiedGatewayPidSync(pid, signal),
formatGatewayPidList: (pids: number[]) => formatGatewayPidList(pids),
}));
vi.mock("../../gateway/probe.js", () => ({
@@ -121,12 +114,12 @@ describe("runDaemonRestart health checks", () => {
renderGatewayPortHealthDiagnostics.mockReset();
renderRestartDiagnostics.mockReset();
resolveGatewayPort.mockReset();
findGatewayPidsOnPortSync.mockReset();
findVerifiedGatewayListenerPidsOnPortSync.mockReset();
signalVerifiedGatewayPidSync.mockReset();
formatGatewayPidList.mockReset();
probeGateway.mockReset();
isRestartEnabled.mockReset();
loadConfig.mockReset();
mockReadFileSync.mockReset();
mockSpawnSync.mockReset();
service.readCommand.mockResolvedValue({
programArguments: ["openclaw", "gateway", "--port", "18789"],
@@ -158,23 +151,8 @@ describe("runDaemonRestart health checks", () => {
configSnapshot: { commands: { restart: true } },
});
isRestartEnabled.mockReturnValue(true);
mockReadFileSync.mockImplementation((path: string) => {
const match = path.match(/\/proc\/(\d+)\/cmdline$/);
if (!match) {
throw new Error(`unexpected path ${path}`);
}
const pid = Number.parseInt(match[1] ?? "", 10);
if ([4200, 4300].includes(pid)) {
return ["openclaw", "gateway", "--port", "18789", ""].join("\0");
}
throw new Error(`unknown pid ${pid}`);
});
mockSpawnSync.mockReturnValue({
error: null,
status: 0,
stdout: "openclaw gateway --port 18789",
stderr: "",
});
signalVerifiedGatewayPidSync.mockImplementation(() => {});
formatGatewayPidList.mockImplementation((pids) => pids.join(", "));
});
afterEach(() => {
@@ -242,38 +220,20 @@ describe("runDaemonRestart health checks", () => {
});
it("signals an unmanaged gateway process on stop", async () => {
vi.spyOn(process, "platform", "get").mockReturnValue("win32");
const killSpy = vi.spyOn(process, "kill").mockImplementation(() => true);
findGatewayPidsOnPortSync.mockReturnValue([4200, 4200, 4300]);
mockSpawnSync.mockReturnValue({
error: null,
status: 0,
stdout:
'CommandLine="C:\\\\Program Files\\\\OpenClaw\\\\openclaw.exe" gateway --port 18789\r\n',
stderr: "",
});
findVerifiedGatewayListenerPidsOnPortSync.mockReturnValue([4200, 4200, 4300]);
runServiceStop.mockImplementation(async (params: { onNotLoaded?: () => Promise<unknown> }) => {
await params.onNotLoaded?.();
});
await runDaemonStop({ json: true });
expect(findGatewayPidsOnPortSync).toHaveBeenCalledWith(18789);
expect(killSpy).toHaveBeenCalledWith(4200, "SIGTERM");
expect(killSpy).toHaveBeenCalledWith(4300, "SIGTERM");
expect(findVerifiedGatewayListenerPidsOnPortSync).toHaveBeenCalledWith(18789);
expect(signalVerifiedGatewayPidSync).toHaveBeenCalledWith(4200, "SIGTERM");
expect(signalVerifiedGatewayPidSync).toHaveBeenCalledWith(4300, "SIGTERM");
});
it("signals a single unmanaged gateway process on restart", async () => {
vi.spyOn(process, "platform", "get").mockReturnValue("win32");
const killSpy = vi.spyOn(process, "kill").mockImplementation(() => true);
findGatewayPidsOnPortSync.mockReturnValue([4200]);
mockSpawnSync.mockReturnValue({
error: null,
status: 0,
stdout:
'CommandLine="C:\\\\Program Files\\\\OpenClaw\\\\openclaw.exe" gateway --port 18789\r\n',
stderr: "",
});
findVerifiedGatewayListenerPidsOnPortSync.mockReturnValue([4200]);
runServiceRestart.mockImplementation(
async (params: RestartParams & { onNotLoaded?: () => Promise<unknown> }) => {
await params.onNotLoaded?.();
@@ -291,8 +251,8 @@ describe("runDaemonRestart health checks", () => {
await runDaemonRestart({ json: true });
expect(findGatewayPidsOnPortSync).toHaveBeenCalledWith(18789);
expect(killSpy).toHaveBeenCalledWith(4200, "SIGUSR1");
expect(findVerifiedGatewayListenerPidsOnPortSync).toHaveBeenCalledWith(18789);
expect(signalVerifiedGatewayPidSync).toHaveBeenCalledWith(4200, "SIGUSR1");
expect(probeGateway).toHaveBeenCalledTimes(1);
expect(waitForGatewayHealthyListener).toHaveBeenCalledTimes(1);
expect(waitForGatewayHealthyRestart).not.toHaveBeenCalled();
@@ -301,15 +261,7 @@ describe("runDaemonRestart health checks", () => {
});
it("fails unmanaged restart when multiple gateway listeners are present", async () => {
vi.spyOn(process, "platform", "get").mockReturnValue("win32");
findGatewayPidsOnPortSync.mockReturnValue([4200, 4300]);
mockSpawnSync.mockReturnValue({
error: null,
status: 0,
stdout:
'CommandLine="C:\\\\Program Files\\\\OpenClaw\\\\openclaw.exe" gateway --port 18789\r\n',
stderr: "",
});
findVerifiedGatewayListenerPidsOnPortSync.mockReturnValue([4200, 4300]);
runServiceRestart.mockImplementation(
async (params: RestartParams & { onNotLoaded?: () => Promise<unknown> }) => {
await params.onNotLoaded?.();
@@ -323,7 +275,7 @@ describe("runDaemonRestart health checks", () => {
});
it("fails unmanaged restart when the running gateway has commands.restart disabled", async () => {
findGatewayPidsOnPortSync.mockReturnValue([4200]);
findVerifiedGatewayListenerPidsOnPortSync.mockReturnValue([4200]);
probeGateway.mockResolvedValue({
ok: true,
configSnapshot: { commands: { restart: false } },
@@ -342,21 +294,13 @@ describe("runDaemonRestart health checks", () => {
});
it("skips unmanaged signaling for pids that are not live gateway processes", async () => {
const killSpy = vi.spyOn(process, "kill").mockImplementation(() => true);
findGatewayPidsOnPortSync.mockReturnValue([4200]);
mockReadFileSync.mockReturnValue(["python", "-m", "http.server", ""].join("\0"));
mockSpawnSync.mockReturnValue({
error: null,
status: 0,
stdout: "python -m http.server",
stderr: "",
});
findVerifiedGatewayListenerPidsOnPortSync.mockReturnValue([]);
runServiceStop.mockImplementation(async (params: { onNotLoaded?: () => Promise<unknown> }) => {
await params.onNotLoaded?.();
});
await runDaemonStop({ json: true });
expect(killSpy).not.toHaveBeenCalled();
expect(signalVerifiedGatewayPidSync).not.toHaveBeenCalled();
});
});

View File

@@ -1,12 +1,12 @@
import { spawnSync } from "node:child_process";
import fsSync from "node:fs";
import { isRestartEnabled } from "../../config/commands.js";
import { readBestEffortConfig, resolveGatewayPort } from "../../config/config.js";
import { parseCmdScriptCommandLine } from "../../daemon/cmd-argv.js";
import { resolveGatewayService } from "../../daemon/service.js";
import { probeGateway } from "../../gateway/probe.js";
import { isGatewayArgv, parseProcCmdline } from "../../infra/gateway-process-argv.js";
import { findGatewayPidsOnPortSync } from "../../infra/restart.js";
import {
findVerifiedGatewayListenerPidsOnPortSync,
formatGatewayPidList,
signalVerifiedGatewayPidSync,
} from "../../infra/gateway-processes.js";
import { defaultRuntime } from "../../runtime.js";
import { theme } from "../../terminal/theme.js";
import { formatCliCommand } from "../command-format.js";
@@ -43,85 +43,12 @@ async function resolveGatewayLifecyclePort(service = resolveGatewayService()) {
return portFromArgs ?? resolveGatewayPort(await readBestEffortConfig(), mergedEnv);
}
/**
 * Pulls a process command line out of raw text such as the `wmic` output
 * requested by `readGatewayProcessArgsSync`.
 *
 * Blank lines are ignored and every remaining line is trimmed. The first line
 * starting with `CommandLine=` (case-insensitive) decides the result: its
 * trimmed value is returned, or `null` when that value is empty. When no such
 * line exists, the first line that is not a bare `CommandLine` header is
 * returned instead.
 *
 * @param raw - Raw text to scan.
 * @returns The extracted command line, or `null` when nothing usable is found.
 */
function extractWindowsCommandLine(raw: string): string | null {
  const keyPrefix = "commandline=";
  const entries: string[] = [];
  for (const piece of raw.split(/\r?\n/)) {
    const trimmed = piece.trim();
    if (trimmed) {
      entries.push(trimmed);
    }
  }

  const keyed = entries.find((entry) => entry.toLowerCase().startsWith(keyPrefix));
  if (keyed !== undefined) {
    // A `CommandLine=` line with an empty value is final; do not fall through.
    const commandLine = keyed.slice(keyPrefix.length).trim();
    return commandLine ? commandLine : null;
  }

  // No key/value line present: skip a bare header line and use the first other line.
  const fallback = entries.find((entry) => entry.toLowerCase() !== "commandline");
  return fallback === undefined ? null : fallback;
}
/**
 * Synchronously looks up the argument vector of a running process.
 *
 * - linux: reads `/proc/<pid>/cmdline` and hands it to `parseProcCmdline`.
 * - darwin: runs `ps -o command= -p <pid>` and splits the output on whitespace.
 * - win32: runs `wmic process where ProcessId=<pid> get CommandLine /value`,
 *   extracts the command line and hands it to `parseCmdScriptCommandLine`.
 *
 * @param pid - Process id to inspect.
 * @returns The process arguments, or `null` when the lookup fails, yields no
 *   command line, or the platform is not one of the three above.
 */
function readGatewayProcessArgsSync(pid: number): string[] | null {
  switch (process.platform) {
    case "linux":
      try {
        const rawCmdline = fsSync.readFileSync(`/proc/${pid}/cmdline`, "utf8");
        return parseProcCmdline(rawCmdline);
      } catch {
        // Unreadable cmdline file or parse failure: report "unknown".
        return null;
      }
    case "darwin": {
      const result = spawnSync("ps", ["-o", "command=", "-p", String(pid)], {
        encoding: "utf8",
        timeout: 1000,
      });
      if (result.error || result.status !== 0) {
        return null;
      }
      const commandLine = result.stdout.trim();
      if (!commandLine) {
        return null;
      }
      return commandLine.split(/\s+/);
    }
    case "win32": {
      const query = ["process", "where", `ProcessId=${pid}`, "get", "CommandLine", "/value"];
      const result = spawnSync("wmic", query, {
        encoding: "utf8",
        timeout: 1000,
      });
      if (result.error || result.status !== 0) {
        return null;
      }
      const commandLine = extractWindowsCommandLine(result.stdout);
      if (!commandLine) {
        return null;
      }
      return parseCmdScriptCommandLine(commandLine);
    }
    default:
      return null;
  }
}
/**
 * Lists the pids listening on `port` whose command line identifies them as a
 * gateway process.
 *
 * Pids reported by `findGatewayPidsOnPortSync` are de-duplicated, non-finite or
 * non-positive values are dropped, and each remaining pid is kept only when its
 * arguments can be read and `isGatewayArgv` accepts them.
 *
 * @param port - Port whose listeners should be inspected.
 * @returns The accepted pids, in first-seen order.
 */
function resolveGatewayListenerPids(port: number): number[] {
  const accepted: number[] = [];
  for (const candidate of new Set(findGatewayPidsOnPortSync(port))) {
    if (!(Number.isFinite(candidate) && candidate > 0)) {
      continue;
    }
    const argv = readGatewayProcessArgsSync(candidate);
    if (argv != null && isGatewayArgv(argv, { allowGatewayBinary: true })) {
      accepted.push(candidate);
    }
  }
  return accepted;
}
/**
 * Resolves the gateway port from the best-effort config and `process.env`.
 *
 * If reading the config (or resolving the port from it) fails, the port is
 * resolved again without a config, using `process.env` only.
 *
 * @returns A promise for the resolved port number.
 */
async function resolveGatewayPortFallback(): Promise<number> {
  try {
    const cfg = await readBestEffortConfig();
    return resolveGatewayPort(cfg, process.env);
  } catch {
    // Config unavailable: fall back to a config-less resolution.
    return resolveGatewayPort(undefined, process.env);
  }
}
/**
 * Sends `signal` to `pid`, but only after re-checking that the process command
 * line still looks like a gateway.
 *
 * @param pid - Target process id.
 * @param signal - Signal to deliver via `process.kill`.
 * @throws Error when the process arguments cannot be read or `isGatewayArgv`
 *   rejects them; no signal is sent in that case.
 */
function signalGatewayPid(pid: number, signal: "SIGTERM" | "SIGUSR1") {
  const argv = readGatewayProcessArgsSync(pid);
  const looksLikeGateway = argv != null && isGatewayArgv(argv, { allowGatewayBinary: true });
  if (looksLikeGateway) {
    process.kill(pid, signal);
    return;
  }
  throw new Error(`refusing to signal non-gateway process pid ${pid}`);
}
/**
 * Renders a list of pids as a single comma-and-space separated string.
 *
 * @param pids - Process ids to render.
 * @returns The joined string; empty when `pids` is empty.
 */
function formatGatewayPidList(pids: number[]): string {
  const rendered = pids.map((pid) => `${pid}`);
  return rendered.join(", ");
}
async function assertUnmanagedGatewayRestartEnabled(port: number): Promise<void> {
const probe = await probeGateway({
url: `ws://127.0.0.1:${port}`,
@@ -143,7 +70,7 @@ async function assertUnmanagedGatewayRestartEnabled(port: number): Promise<void>
}
function resolveVerifiedGatewayListenerPids(port: number): number[] {
return resolveGatewayListenerPids(port).filter(
return findVerifiedGatewayListenerPidsOnPortSync(port).filter(
(pid): pid is number => Number.isFinite(pid) && pid > 0,
);
}
@@ -154,7 +81,7 @@ async function stopGatewayWithoutServiceManager(port: number) {
return null;
}
for (const pid of pids) {
signalGatewayPid(pid, "SIGTERM");
signalVerifiedGatewayPidSync(pid, "SIGTERM");
}
return {
result: "stopped" as const,
@@ -173,7 +100,7 @@ async function restartGatewayWithoutServiceManager(port: number) {
`multiple gateway processes are listening on port ${port}: ${formatGatewayPidList(pids)}; use "openclaw gateway status --deep" before retrying restart`,
);
}
signalGatewayPid(pids[0], "SIGUSR1");
signalVerifiedGatewayPidSync(pids[0], "SIGUSR1");
return {
result: "restarted" as const,
message: `Gateway restart signal sent to unmanaged process on port ${port}: ${pids[0]}.`,

View File

@@ -190,6 +190,32 @@ describe("inspectGatewayRestart", () => {
);
});
it("treats a busy port as healthy when runtime status lags but the probe succeeds", async () => {
  const port = 18789;

  // Arrange: win32 platform, a busy port owned by a gateway-classified listener,
  // a service runtime that still reports "stopped", and a probe that succeeds.
  Object.defineProperty(process, "platform", { configurable: true, value: "win32" });
  inspectPortUsage.mockResolvedValue({
    port,
    status: "busy",
    listeners: [{ pid: 9100, commandLine: "openclaw-gateway" }],
    hints: [],
  });
  classifyPortListener.mockReturnValue("gateway");
  probeGateway.mockResolvedValue({ ok: true, close: null });
  const readRuntime = vi.fn(async () => ({ status: "stopped" }));
  const service = { readRuntime } as unknown as GatewayService;

  // Act: the module is imported only after the mocks above are configured.
  const { inspectGatewayRestart } = await import("./restart-health.js");
  const snapshot = await inspectGatewayRestart({ service, port });

  // Assert: the successful probe wins over the lagging runtime status.
  expect(snapshot.healthy).toBe(true);
  expect(snapshot.staleGatewayPids).toEqual([]);
});
it("treats auth-closed probe as healthy gateway reachability", async () => {
const snapshot = await inspectAmbiguousOwnershipWithProbe({
ok: false,

View File

@@ -65,7 +65,8 @@ async function confirmGatewayReachable(port: number): Promise<boolean> {
const probe = await probeGateway({
url: `ws://127.0.0.1:${port}`,
auth: token || password ? { token, password } : undefined,
timeoutMs: 1_000,
timeoutMs: 3_000,
includeDetails: false,
});
return probe.ok || looksLikeAuthClose(probe.close?.code, probe.close?.reason);
}
@@ -123,6 +124,22 @@ export async function inspectGatewayRestart(params: {
};
}
if (portUsage.status === "busy" && runtime.status !== "running") {
try {
const reachable = await confirmGatewayReachable(params.port);
if (reachable) {
return {
runtime,
portUsage,
healthy: true,
staleGatewayPids: [],
};
}
} catch {
// Probe is best-effort; keep the ownership-based diagnostics.
}
}
const gatewayListeners =
portUsage.status === "busy"
? portUsage.listeners.filter(