fix: preserve Windows scheduled task restart/install behavior (#59335) (thanks @tmimmanuel)

* fix(daemon): preserve Windows Task Scheduler settings on reinstall and exit early on failed restart

* fix(daemon): add test coverage for Create/Change paths, fix early exit grace period

* fix(daemon): fix startup-fallback tests for new isRegisteredScheduledTask call

* fix(daemon): report early restart failure accurately

* fix: preserve Windows scheduled task restart/install behavior (#59335) (thanks @tmimmanuel)

---------

Co-authored-by: Ayaan Zaidi <hi@obviy.us>
This commit is contained in:
tmimmanuel
2026-04-04 05:16:00 +02:00
committed by GitHub
parent ff0c1b57a7
commit 0fef95b17d
9 changed files with 328 additions and 19 deletions

View File

@@ -6,6 +6,8 @@ type RestartHealthSnapshot = {
staleGatewayPids: number[];
runtime: { status?: string };
portUsage: { port: number; status: string; listeners: []; hints: []; errors?: string[] };
waitOutcome?: string;
elapsedMs?: number;
};
type RestartPostCheckContext = {
@@ -32,7 +34,7 @@ const waitForGatewayHealthyRestart = vi.fn();
const terminateStaleGatewayPids = vi.fn();
const renderGatewayPortHealthDiagnostics = vi.fn(() => ["diag: unhealthy port"]);
const renderRestartDiagnostics = vi.fn(() => ["diag: unhealthy runtime"]);
const resolveGatewayPort = vi.fn(() => 18789);
const resolveGatewayPort = vi.hoisted(() => vi.fn((_cfg?: unknown, _env?: unknown) => 18789));
const findVerifiedGatewayListenerPidsOnPortSync = vi.fn<(port: number) => number[]>(() => []);
const signalVerifiedGatewayPidSync = vi.fn<(pid: number, signal: "SIGTERM" | "SIGUSR1") => void>();
const formatGatewayPidList = vi.fn<(pids: number[]) => string>((pids) => pids.join(", "));
@@ -47,12 +49,12 @@ const probeGateway = vi.fn<
}>
>();
const isRestartEnabled = vi.fn<(config?: { commands?: unknown }) => boolean>(() => true);
const loadConfig = vi.fn(() => ({}));
const loadConfig = vi.hoisted(() => vi.fn(() => ({})));
vi.mock("../../config/config.js", () => ({
loadConfig: () => loadConfig(),
readBestEffortConfig: async () => loadConfig(),
resolveGatewayPort,
resolveGatewayPort: (cfg?: unknown, env?: unknown) => resolveGatewayPort(cfg, env),
}));
vi.mock("../../infra/gateway-processes.js", () => ({
@@ -230,13 +232,15 @@ describe("runDaemonRestart health checks", () => {
expect(waitForGatewayHealthyRestart).toHaveBeenCalledTimes(1);
});
it("fails restart when gateway remains unhealthy", async () => {
it("fails restart when gateway remains unhealthy after the full timeout", async () => {
const { formatCliCommand } = await import("../command-format.js");
const unhealthy: RestartHealthSnapshot = {
healthy: false,
staleGatewayPids: [],
runtime: { status: "stopped" },
portUsage: { port: 18789, status: "free", listeners: [], hints: [] },
waitOutcome: "timeout",
elapsedMs: 60_000,
};
waitForGatewayHealthyRestart.mockResolvedValue(unhealthy);
@@ -251,6 +255,30 @@ describe("runDaemonRestart health checks", () => {
expect(renderRestartDiagnostics).toHaveBeenCalledTimes(1);
});
it("fails restart with a stopped-free message when the waiter exits early", async () => {
const { formatCliCommand } = await import("../command-format.js");
const unhealthy: RestartHealthSnapshot = {
healthy: false,
staleGatewayPids: [],
runtime: { status: "stopped" },
portUsage: { port: 18789, status: "free", listeners: [], hints: [] },
waitOutcome: "stopped-free",
elapsedMs: 12_500,
};
waitForGatewayHealthyRestart.mockResolvedValue(unhealthy);
await expect(runDaemonRestart({ json: true })).rejects.toMatchObject({
message:
"Gateway restart failed after 13s: service stayed stopped and health checks never came up.",
hints: [
formatCliCommand("openclaw gateway status --deep"),
formatCliCommand("openclaw doctor"),
],
});
expect(terminateStaleGatewayPids).not.toHaveBeenCalled();
expect(renderRestartDiagnostics).toHaveBeenCalledTimes(1);
});
it("signals an unmanaged gateway process on stop", async () => {
findVerifiedGatewayListenerPidsOnPortSync.mockReturnValue([4200, 4200, 4300]);
runServiceStop.mockImplementation(async (params: { onNotLoaded?: () => Promise<unknown> }) => {

View File

@@ -19,6 +19,7 @@ import {
import {
DEFAULT_RESTART_HEALTH_ATTEMPTS,
DEFAULT_RESTART_HEALTH_DELAY_MS,
type GatewayRestartSnapshot,
renderGatewayPortHealthDiagnostics,
renderRestartDiagnostics,
terminateStaleGatewayPids,
@@ -31,6 +32,25 @@ import type { DaemonLifecycleOptions } from "./types.js";
const POST_RESTART_HEALTH_ATTEMPTS = DEFAULT_RESTART_HEALTH_ATTEMPTS;
const POST_RESTART_HEALTH_DELAY_MS = DEFAULT_RESTART_HEALTH_DELAY_MS;
function formatRestartFailure(params: {
health: GatewayRestartSnapshot;
port: number;
timeoutSeconds: number;
}): { statusLine: string; failMessage: string } {
if (params.health.waitOutcome === "stopped-free") {
const elapsedSeconds = Math.max(1, Math.round((params.health.elapsedMs ?? 0) / 1000));
return {
statusLine: `Gateway restart failed after ${elapsedSeconds}s: service stayed stopped and port ${params.port} stayed free.`,
failMessage: `Gateway restart failed after ${elapsedSeconds}s: service stayed stopped and health checks never came up.`,
};
}
return {
statusLine: `Timed out after ${params.timeoutSeconds}s waiting for gateway port ${params.port} to become healthy.`,
failMessage: `Gateway restart timed out after ${params.timeoutSeconds}s waiting for health checks.`,
};
}
async function resolveGatewayLifecyclePort(service = resolveGatewayService()) {
const command = await service.readCommand(process.env).catch(() => null);
const serviceEnv = command?.environment ?? undefined;
@@ -234,13 +254,17 @@ export async function runDaemonRestart(opts: DaemonLifecycleOptions = {}): Promi
}
const diagnostics = renderRestartDiagnostics(health);
const timeoutLine = `Timed out after ${restartWaitSeconds}s waiting for gateway port ${restartPort} to become healthy.`;
const failure = formatRestartFailure({
health,
port: restartPort,
timeoutSeconds: restartWaitSeconds,
});
const runningNoPortLine =
health.runtime.status === "running" && health.portUsage.status === "free"
? `Gateway process is running but port ${restartPort} is still free (startup hang/crash loop or very slow VM startup).`
: null;
if (!json) {
defaultRuntime.log(theme.warn(timeoutLine));
defaultRuntime.log(theme.warn(failure.statusLine));
if (runningNoPortLine) {
defaultRuntime.log(theme.warn(runningNoPortLine));
}
@@ -248,14 +272,14 @@ export async function runDaemonRestart(opts: DaemonLifecycleOptions = {}): Promi
defaultRuntime.log(theme.muted(line));
}
} else {
warnings.push(timeoutLine);
warnings.push(failure.statusLine);
if (runningNoPortLine) {
warnings.push(runningNoPortLine);
}
warnings.push(...diagnostics);
}
fail(`Gateway restart timed out after ${restartWaitSeconds}s waiting for health checks.`, [
fail(failure.failMessage, [
formatCliCommand("openclaw gateway status --deep"),
formatCliCommand("openclaw doctor"),
]);

View File

@@ -3,6 +3,7 @@ import type { GatewayService } from "../../daemon/service.js";
import type { PortListenerKind, PortUsage } from "../../infra/ports.js";
const inspectPortUsage = vi.hoisted(() => vi.fn<(port: number) => Promise<PortUsage>>());
const sleep = vi.hoisted(() => vi.fn(async (_ms: number) => {}));
const classifyPortListener = vi.hoisted(() =>
vi.fn<(_listener: unknown, _port: number) => PortListenerKind>(() => "gateway"),
);
@@ -18,6 +19,14 @@ vi.mock("../../gateway/probe.js", () => ({
probeGateway: (opts: unknown) => probeGateway(opts),
}));
vi.mock("../../utils.js", async () => {
const actual = await vi.importActual<typeof import("../../utils.js")>("../../utils.js");
return {
...actual,
sleep: (ms: number) => sleep(ms),
};
});
const originalPlatform = process.platform;
function makeGatewayService(
@@ -88,6 +97,7 @@ describe("inspectGatewayRestart", () => {
listeners: [],
hints: [],
});
sleep.mockReset();
classifyPortListener.mockReset();
classifyPortListener.mockReturnValue("gateway");
probeGateway.mockReset();
@@ -240,4 +250,58 @@ describe("inspectGatewayRestart", () => {
expect(snapshot.healthy).toBe(true);
expect(probeGateway).not.toHaveBeenCalled();
});
it("annotates stopped-free early exits with the actual elapsed time", async () => {
const service = makeGatewayService({ status: "stopped" });
inspectPortUsage.mockResolvedValue({
port: 18789,
status: "free",
listeners: [],
hints: [],
});
const { waitForGatewayHealthyRestart } = await import("./restart-health.js");
const snapshot = await waitForGatewayHealthyRestart({
service,
port: 18789,
attempts: 120,
delayMs: 500,
});
expect(snapshot).toMatchObject({
healthy: false,
runtime: { status: "stopped" },
portUsage: { status: "free" },
waitOutcome: "stopped-free",
elapsedMs: 12_500,
});
expect(sleep).toHaveBeenCalledTimes(25);
});
it("annotates timeout waits when the health loop exhausts all attempts", async () => {
const service = makeGatewayService({ status: "running", pid: 8000 });
inspectPortUsage.mockResolvedValue({
port: 18789,
status: "free",
listeners: [],
hints: [],
});
const { waitForGatewayHealthyRestart } = await import("./restart-health.js");
const snapshot = await waitForGatewayHealthyRestart({
service,
port: 18789,
attempts: 4,
delayMs: 1_000,
});
expect(snapshot).toMatchObject({
healthy: false,
runtime: { status: "running", pid: 8000 },
portUsage: { status: "free" },
waitOutcome: "timeout",
elapsedMs: 4_000,
});
expect(sleep).toHaveBeenCalledTimes(4);
});
});

View File

@@ -16,11 +16,15 @@ export const DEFAULT_RESTART_HEALTH_ATTEMPTS = Math.ceil(
DEFAULT_RESTART_HEALTH_TIMEOUT_MS / DEFAULT_RESTART_HEALTH_DELAY_MS,
);
export type GatewayRestartWaitOutcome = "healthy" | "stale-pids" | "stopped-free" | "timeout";
export type GatewayRestartSnapshot = {
runtime: GatewayServiceRuntime;
portUsage: PortUsage;
healthy: boolean;
staleGatewayPids: number[];
waitOutcome?: GatewayRestartWaitOutcome;
elapsedMs?: number;
};
export type GatewayPortHealthSnapshot = {
@@ -201,6 +205,26 @@ export async function inspectGatewayRestart(params: {
};
}
function shouldEarlyExitStoppedFree(
snapshot: GatewayRestartSnapshot,
attempt: number,
minAttempt: number,
): boolean {
return (
attempt >= minAttempt &&
snapshot.runtime.status === "stopped" &&
snapshot.portUsage.status === "free"
);
}
function withWaitContext(
snapshot: GatewayRestartSnapshot,
waitOutcome: GatewayRestartWaitOutcome,
elapsedMs: number,
): GatewayRestartSnapshot {
return { ...snapshot, waitOutcome, elapsedMs };
}
export async function waitForGatewayHealthyRestart(params: {
service: GatewayService;
port: number;
@@ -219,12 +243,24 @@ export async function waitForGatewayHealthyRestart(params: {
includeUnknownListenersAsStale: params.includeUnknownListenersAsStale,
});
let consecutiveStoppedFreeCount = 0;
const STOPPED_FREE_THRESHOLD = 6;
const minAttemptForEarlyExit = Math.min(Math.ceil(10_000 / delayMs), Math.floor(attempts / 2));
for (let attempt = 0; attempt < attempts; attempt += 1) {
if (snapshot.healthy) {
return snapshot;
return withWaitContext(snapshot, "healthy", attempt * delayMs);
}
if (snapshot.staleGatewayPids.length > 0 && snapshot.runtime.status !== "running") {
return snapshot;
return withWaitContext(snapshot, "stale-pids", attempt * delayMs);
}
if (shouldEarlyExitStoppedFree(snapshot, attempt, minAttemptForEarlyExit)) {
consecutiveStoppedFreeCount += 1;
if (consecutiveStoppedFreeCount >= STOPPED_FREE_THRESHOLD) {
return withWaitContext(snapshot, "stopped-free", attempt * delayMs);
}
} else if (snapshot.runtime.status !== "stopped" || snapshot.portUsage.status !== "free") {
consecutiveStoppedFreeCount = 0;
}
await sleep(delayMs);
snapshot = await inspectGatewayRestart({
@@ -235,7 +271,7 @@ export async function waitForGatewayHealthyRestart(params: {
});
}
return snapshot;
return withWaitContext(snapshot, "timeout", attempts * delayMs);
}
export async function waitForGatewayHealthyListener(params: {