mirror of
https://github.com/openclaw/openclaw.git
synced 2026-04-07 15:21:06 +00:00
fix: preserve Windows scheduled task restart/install behavior (#59335) (thanks @tmimmanuel)
* fix(daemon): preserve Windows Task Scheduler settings on reinstall and exit early on failed restart * fix(daemon): add test coverage for Create/Change paths, fix early exit grace period * fix(daemon): fix startup-fallback tests for new isRegisteredScheduledTask call * fix(daemon): report early restart failure accurately * fix: preserve Windows scheduled task restart/install behavior (#59335) (thanks @tmimmanuel) --------- Co-authored-by: Ayaan Zaidi <hi@obviy.us>
This commit is contained in:
@@ -6,6 +6,8 @@ type RestartHealthSnapshot = {
|
||||
staleGatewayPids: number[];
|
||||
runtime: { status?: string };
|
||||
portUsage: { port: number; status: string; listeners: []; hints: []; errors?: string[] };
|
||||
waitOutcome?: string;
|
||||
elapsedMs?: number;
|
||||
};
|
||||
|
||||
type RestartPostCheckContext = {
|
||||
@@ -32,7 +34,7 @@ const waitForGatewayHealthyRestart = vi.fn();
|
||||
const terminateStaleGatewayPids = vi.fn();
|
||||
const renderGatewayPortHealthDiagnostics = vi.fn(() => ["diag: unhealthy port"]);
|
||||
const renderRestartDiagnostics = vi.fn(() => ["diag: unhealthy runtime"]);
|
||||
const resolveGatewayPort = vi.fn(() => 18789);
|
||||
const resolveGatewayPort = vi.hoisted(() => vi.fn((_cfg?: unknown, _env?: unknown) => 18789));
|
||||
const findVerifiedGatewayListenerPidsOnPortSync = vi.fn<(port: number) => number[]>(() => []);
|
||||
const signalVerifiedGatewayPidSync = vi.fn<(pid: number, signal: "SIGTERM" | "SIGUSR1") => void>();
|
||||
const formatGatewayPidList = vi.fn<(pids: number[]) => string>((pids) => pids.join(", "));
|
||||
@@ -47,12 +49,12 @@ const probeGateway = vi.fn<
|
||||
}>
|
||||
>();
|
||||
const isRestartEnabled = vi.fn<(config?: { commands?: unknown }) => boolean>(() => true);
|
||||
const loadConfig = vi.fn(() => ({}));
|
||||
const loadConfig = vi.hoisted(() => vi.fn(() => ({})));
|
||||
|
||||
vi.mock("../../config/config.js", () => ({
|
||||
loadConfig: () => loadConfig(),
|
||||
readBestEffortConfig: async () => loadConfig(),
|
||||
resolveGatewayPort,
|
||||
resolveGatewayPort: (cfg?: unknown, env?: unknown) => resolveGatewayPort(cfg, env),
|
||||
}));
|
||||
|
||||
vi.mock("../../infra/gateway-processes.js", () => ({
|
||||
@@ -230,13 +232,15 @@ describe("runDaemonRestart health checks", () => {
|
||||
expect(waitForGatewayHealthyRestart).toHaveBeenCalledTimes(1);
|
||||
});
|
||||
|
||||
it("fails restart when gateway remains unhealthy", async () => {
|
||||
it("fails restart when gateway remains unhealthy after the full timeout", async () => {
|
||||
const { formatCliCommand } = await import("../command-format.js");
|
||||
const unhealthy: RestartHealthSnapshot = {
|
||||
healthy: false,
|
||||
staleGatewayPids: [],
|
||||
runtime: { status: "stopped" },
|
||||
portUsage: { port: 18789, status: "free", listeners: [], hints: [] },
|
||||
waitOutcome: "timeout",
|
||||
elapsedMs: 60_000,
|
||||
};
|
||||
waitForGatewayHealthyRestart.mockResolvedValue(unhealthy);
|
||||
|
||||
@@ -251,6 +255,30 @@ describe("runDaemonRestart health checks", () => {
|
||||
expect(renderRestartDiagnostics).toHaveBeenCalledTimes(1);
|
||||
});
|
||||
|
||||
it("fails restart with a stopped-free message when the waiter exits early", async () => {
|
||||
const { formatCliCommand } = await import("../command-format.js");
|
||||
const unhealthy: RestartHealthSnapshot = {
|
||||
healthy: false,
|
||||
staleGatewayPids: [],
|
||||
runtime: { status: "stopped" },
|
||||
portUsage: { port: 18789, status: "free", listeners: [], hints: [] },
|
||||
waitOutcome: "stopped-free",
|
||||
elapsedMs: 12_500,
|
||||
};
|
||||
waitForGatewayHealthyRestart.mockResolvedValue(unhealthy);
|
||||
|
||||
await expect(runDaemonRestart({ json: true })).rejects.toMatchObject({
|
||||
message:
|
||||
"Gateway restart failed after 13s: service stayed stopped and health checks never came up.",
|
||||
hints: [
|
||||
formatCliCommand("openclaw gateway status --deep"),
|
||||
formatCliCommand("openclaw doctor"),
|
||||
],
|
||||
});
|
||||
expect(terminateStaleGatewayPids).not.toHaveBeenCalled();
|
||||
expect(renderRestartDiagnostics).toHaveBeenCalledTimes(1);
|
||||
});
|
||||
|
||||
it("signals an unmanaged gateway process on stop", async () => {
|
||||
findVerifiedGatewayListenerPidsOnPortSync.mockReturnValue([4200, 4200, 4300]);
|
||||
runServiceStop.mockImplementation(async (params: { onNotLoaded?: () => Promise<unknown> }) => {
|
||||
|
||||
@@ -19,6 +19,7 @@ import {
|
||||
import {
|
||||
DEFAULT_RESTART_HEALTH_ATTEMPTS,
|
||||
DEFAULT_RESTART_HEALTH_DELAY_MS,
|
||||
type GatewayRestartSnapshot,
|
||||
renderGatewayPortHealthDiagnostics,
|
||||
renderRestartDiagnostics,
|
||||
terminateStaleGatewayPids,
|
||||
@@ -31,6 +32,25 @@ import type { DaemonLifecycleOptions } from "./types.js";
|
||||
const POST_RESTART_HEALTH_ATTEMPTS = DEFAULT_RESTART_HEALTH_ATTEMPTS;
|
||||
const POST_RESTART_HEALTH_DELAY_MS = DEFAULT_RESTART_HEALTH_DELAY_MS;
|
||||
|
||||
function formatRestartFailure(params: {
|
||||
health: GatewayRestartSnapshot;
|
||||
port: number;
|
||||
timeoutSeconds: number;
|
||||
}): { statusLine: string; failMessage: string } {
|
||||
if (params.health.waitOutcome === "stopped-free") {
|
||||
const elapsedSeconds = Math.max(1, Math.round((params.health.elapsedMs ?? 0) / 1000));
|
||||
return {
|
||||
statusLine: `Gateway restart failed after ${elapsedSeconds}s: service stayed stopped and port ${params.port} stayed free.`,
|
||||
failMessage: `Gateway restart failed after ${elapsedSeconds}s: service stayed stopped and health checks never came up.`,
|
||||
};
|
||||
}
|
||||
|
||||
return {
|
||||
statusLine: `Timed out after ${params.timeoutSeconds}s waiting for gateway port ${params.port} to become healthy.`,
|
||||
failMessage: `Gateway restart timed out after ${params.timeoutSeconds}s waiting for health checks.`,
|
||||
};
|
||||
}
|
||||
|
||||
async function resolveGatewayLifecyclePort(service = resolveGatewayService()) {
|
||||
const command = await service.readCommand(process.env).catch(() => null);
|
||||
const serviceEnv = command?.environment ?? undefined;
|
||||
@@ -234,13 +254,17 @@ export async function runDaemonRestart(opts: DaemonLifecycleOptions = {}): Promi
|
||||
}
|
||||
|
||||
const diagnostics = renderRestartDiagnostics(health);
|
||||
const timeoutLine = `Timed out after ${restartWaitSeconds}s waiting for gateway port ${restartPort} to become healthy.`;
|
||||
const failure = formatRestartFailure({
|
||||
health,
|
||||
port: restartPort,
|
||||
timeoutSeconds: restartWaitSeconds,
|
||||
});
|
||||
const runningNoPortLine =
|
||||
health.runtime.status === "running" && health.portUsage.status === "free"
|
||||
? `Gateway process is running but port ${restartPort} is still free (startup hang/crash loop or very slow VM startup).`
|
||||
: null;
|
||||
if (!json) {
|
||||
defaultRuntime.log(theme.warn(timeoutLine));
|
||||
defaultRuntime.log(theme.warn(failure.statusLine));
|
||||
if (runningNoPortLine) {
|
||||
defaultRuntime.log(theme.warn(runningNoPortLine));
|
||||
}
|
||||
@@ -248,14 +272,14 @@ export async function runDaemonRestart(opts: DaemonLifecycleOptions = {}): Promi
|
||||
defaultRuntime.log(theme.muted(line));
|
||||
}
|
||||
} else {
|
||||
warnings.push(timeoutLine);
|
||||
warnings.push(failure.statusLine);
|
||||
if (runningNoPortLine) {
|
||||
warnings.push(runningNoPortLine);
|
||||
}
|
||||
warnings.push(...diagnostics);
|
||||
}
|
||||
|
||||
fail(`Gateway restart timed out after ${restartWaitSeconds}s waiting for health checks.`, [
|
||||
fail(failure.failMessage, [
|
||||
formatCliCommand("openclaw gateway status --deep"),
|
||||
formatCliCommand("openclaw doctor"),
|
||||
]);
|
||||
|
||||
@@ -3,6 +3,7 @@ import type { GatewayService } from "../../daemon/service.js";
|
||||
import type { PortListenerKind, PortUsage } from "../../infra/ports.js";
|
||||
|
||||
const inspectPortUsage = vi.hoisted(() => vi.fn<(port: number) => Promise<PortUsage>>());
|
||||
const sleep = vi.hoisted(() => vi.fn(async (_ms: number) => {}));
|
||||
const classifyPortListener = vi.hoisted(() =>
|
||||
vi.fn<(_listener: unknown, _port: number) => PortListenerKind>(() => "gateway"),
|
||||
);
|
||||
@@ -18,6 +19,14 @@ vi.mock("../../gateway/probe.js", () => ({
|
||||
probeGateway: (opts: unknown) => probeGateway(opts),
|
||||
}));
|
||||
|
||||
vi.mock("../../utils.js", async () => {
|
||||
const actual = await vi.importActual<typeof import("../../utils.js")>("../../utils.js");
|
||||
return {
|
||||
...actual,
|
||||
sleep: (ms: number) => sleep(ms),
|
||||
};
|
||||
});
|
||||
|
||||
const originalPlatform = process.platform;
|
||||
|
||||
function makeGatewayService(
|
||||
@@ -88,6 +97,7 @@ describe("inspectGatewayRestart", () => {
|
||||
listeners: [],
|
||||
hints: [],
|
||||
});
|
||||
sleep.mockReset();
|
||||
classifyPortListener.mockReset();
|
||||
classifyPortListener.mockReturnValue("gateway");
|
||||
probeGateway.mockReset();
|
||||
@@ -240,4 +250,58 @@ describe("inspectGatewayRestart", () => {
|
||||
expect(snapshot.healthy).toBe(true);
|
||||
expect(probeGateway).not.toHaveBeenCalled();
|
||||
});
|
||||
|
||||
it("annotates stopped-free early exits with the actual elapsed time", async () => {
|
||||
const service = makeGatewayService({ status: "stopped" });
|
||||
inspectPortUsage.mockResolvedValue({
|
||||
port: 18789,
|
||||
status: "free",
|
||||
listeners: [],
|
||||
hints: [],
|
||||
});
|
||||
|
||||
const { waitForGatewayHealthyRestart } = await import("./restart-health.js");
|
||||
const snapshot = await waitForGatewayHealthyRestart({
|
||||
service,
|
||||
port: 18789,
|
||||
attempts: 120,
|
||||
delayMs: 500,
|
||||
});
|
||||
|
||||
expect(snapshot).toMatchObject({
|
||||
healthy: false,
|
||||
runtime: { status: "stopped" },
|
||||
portUsage: { status: "free" },
|
||||
waitOutcome: "stopped-free",
|
||||
elapsedMs: 12_500,
|
||||
});
|
||||
expect(sleep).toHaveBeenCalledTimes(25);
|
||||
});
|
||||
|
||||
it("annotates timeout waits when the health loop exhausts all attempts", async () => {
|
||||
const service = makeGatewayService({ status: "running", pid: 8000 });
|
||||
inspectPortUsage.mockResolvedValue({
|
||||
port: 18789,
|
||||
status: "free",
|
||||
listeners: [],
|
||||
hints: [],
|
||||
});
|
||||
|
||||
const { waitForGatewayHealthyRestart } = await import("./restart-health.js");
|
||||
const snapshot = await waitForGatewayHealthyRestart({
|
||||
service,
|
||||
port: 18789,
|
||||
attempts: 4,
|
||||
delayMs: 1_000,
|
||||
});
|
||||
|
||||
expect(snapshot).toMatchObject({
|
||||
healthy: false,
|
||||
runtime: { status: "running", pid: 8000 },
|
||||
portUsage: { status: "free" },
|
||||
waitOutcome: "timeout",
|
||||
elapsedMs: 4_000,
|
||||
});
|
||||
expect(sleep).toHaveBeenCalledTimes(4);
|
||||
});
|
||||
});
|
||||
|
||||
@@ -16,11 +16,15 @@ export const DEFAULT_RESTART_HEALTH_ATTEMPTS = Math.ceil(
|
||||
DEFAULT_RESTART_HEALTH_TIMEOUT_MS / DEFAULT_RESTART_HEALTH_DELAY_MS,
|
||||
);
|
||||
|
||||
export type GatewayRestartWaitOutcome = "healthy" | "stale-pids" | "stopped-free" | "timeout";
|
||||
|
||||
export type GatewayRestartSnapshot = {
|
||||
runtime: GatewayServiceRuntime;
|
||||
portUsage: PortUsage;
|
||||
healthy: boolean;
|
||||
staleGatewayPids: number[];
|
||||
waitOutcome?: GatewayRestartWaitOutcome;
|
||||
elapsedMs?: number;
|
||||
};
|
||||
|
||||
export type GatewayPortHealthSnapshot = {
|
||||
@@ -201,6 +205,26 @@ export async function inspectGatewayRestart(params: {
|
||||
};
|
||||
}
|
||||
|
||||
function shouldEarlyExitStoppedFree(
|
||||
snapshot: GatewayRestartSnapshot,
|
||||
attempt: number,
|
||||
minAttempt: number,
|
||||
): boolean {
|
||||
return (
|
||||
attempt >= minAttempt &&
|
||||
snapshot.runtime.status === "stopped" &&
|
||||
snapshot.portUsage.status === "free"
|
||||
);
|
||||
}
|
||||
|
||||
function withWaitContext(
|
||||
snapshot: GatewayRestartSnapshot,
|
||||
waitOutcome: GatewayRestartWaitOutcome,
|
||||
elapsedMs: number,
|
||||
): GatewayRestartSnapshot {
|
||||
return { ...snapshot, waitOutcome, elapsedMs };
|
||||
}
|
||||
|
||||
export async function waitForGatewayHealthyRestart(params: {
|
||||
service: GatewayService;
|
||||
port: number;
|
||||
@@ -219,12 +243,24 @@ export async function waitForGatewayHealthyRestart(params: {
|
||||
includeUnknownListenersAsStale: params.includeUnknownListenersAsStale,
|
||||
});
|
||||
|
||||
let consecutiveStoppedFreeCount = 0;
|
||||
const STOPPED_FREE_THRESHOLD = 6;
|
||||
const minAttemptForEarlyExit = Math.min(Math.ceil(10_000 / delayMs), Math.floor(attempts / 2));
|
||||
|
||||
for (let attempt = 0; attempt < attempts; attempt += 1) {
|
||||
if (snapshot.healthy) {
|
||||
return snapshot;
|
||||
return withWaitContext(snapshot, "healthy", attempt * delayMs);
|
||||
}
|
||||
if (snapshot.staleGatewayPids.length > 0 && snapshot.runtime.status !== "running") {
|
||||
return snapshot;
|
||||
return withWaitContext(snapshot, "stale-pids", attempt * delayMs);
|
||||
}
|
||||
if (shouldEarlyExitStoppedFree(snapshot, attempt, minAttemptForEarlyExit)) {
|
||||
consecutiveStoppedFreeCount += 1;
|
||||
if (consecutiveStoppedFreeCount >= STOPPED_FREE_THRESHOLD) {
|
||||
return withWaitContext(snapshot, "stopped-free", attempt * delayMs);
|
||||
}
|
||||
} else if (snapshot.runtime.status !== "stopped" || snapshot.portUsage.status !== "free") {
|
||||
consecutiveStoppedFreeCount = 0;
|
||||
}
|
||||
await sleep(delayMs);
|
||||
snapshot = await inspectGatewayRestart({
|
||||
@@ -235,7 +271,7 @@ export async function waitForGatewayHealthyRestart(params: {
|
||||
});
|
||||
}
|
||||
|
||||
return snapshot;
|
||||
return withWaitContext(snapshot, "timeout", attempts * delayMs);
|
||||
}
|
||||
|
||||
export async function waitForGatewayHealthyListener(params: {
|
||||
|
||||
Reference in New Issue
Block a user