From 3dff1272e963d7f72ab2df3c50362cd70f99c4bc Mon Sep 17 00:00:00 2001 From: Thatgfsj Date: Tue, 28 Apr 2026 14:57:47 +0800 Subject: [PATCH] fix: harden Windows gateway restart fallback (#69056) Thanks @Thatgfsj. --- CHANGELOG.md | 1 + src/infra/infra-runtime.test.ts | 116 ++++++++++++++++++++++++++++++++ src/infra/restart.ts | 29 ++++++-- 3 files changed, 142 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index a8de04e51bb..549ea9b434f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -15,6 +15,7 @@ Docs: https://docs.openclaw.ai - Build/runtime: write the runtime-postbuild stamp after `pnpm build` writes the build stamp, so the next CLI invocation does not re-sync runtime artifacts after a successful build. Fixes #73151. Thanks @bittoby. - CLI/model probes: reject empty or whitespace-only `infer model run --prompt` values before calling local providers or the Gateway, so smoke checks do not spend provider calls on invalid turns. Fixes #73185. Thanks @iot2edge. - Gateway/media: route text-only `chat.send` image offloads through media-understanding fields so `agents.defaults.imageModel` can describe WebChat attachments instead of leaving only an opaque `media://inbound` marker. Fixes #72968. Thanks @vorajeeah. +- Gateway/Windows: route no-listener restart handoffs through the Windows supervisor without leaving restart tokens in flight, so failed task scheduling can be retried and successful handoffs do not coalesce later restart requests. (#69056) Thanks @Thatgfsj. - Gateway/sessions: remove automatic oversized `sessions.json` rotation backups, deprecate `session.maintenance.rotateBytes`, and teach `openclaw doctor --fix` to remove the ignored key so hot session writes no longer copy multi-MB stores. Refs #72338. Thanks @midhunmonachan and @DougButdorf. - Channels/Telegram: fail fast when Telegram rejects the startup `getMe` token probe with 401, so invalid or stale BotFather tokens are reported as token auth failures instead of misleading `deleteWebhook` cleanup failures. Fixes #47674. Thanks @samaedan-arch. - ACPX: keep generated Codex and Claude ACP wrapper startup paths working when remote or special state filesystems reject chmod, since OpenClaw invokes the wrappers through Node instead of executing them directly. Fixes #73333. Thanks @david-garcia-garcia. diff --git a/src/infra/infra-runtime.test.ts b/src/infra/infra-runtime.test.ts index 704835e3dba..90188508482 100644 --- a/src/infra/infra-runtime.test.ts +++ b/src/infra/infra-runtime.test.ts @@ -19,10 +19,76 @@ import { } from "./restart.js"; import { listTailnetAddresses } from "./tailnet.js"; +const relaunchGatewayScheduledTaskMock = vi.hoisted(() => vi.fn()); +const cleanStaleGatewayProcessesSyncMock = vi.hoisted(() => vi.fn()); +const findGatewayPidsOnPortSyncMock = vi.hoisted(() => vi.fn()); + +vi.mock("./restart-stale-pids.js", () => ({ + cleanStaleGatewayProcessesSync: (...args: unknown[]) => + cleanStaleGatewayProcessesSyncMock(...args), + findGatewayPidsOnPortSync: (...args: unknown[]) => findGatewayPidsOnPortSyncMock(...args), +})); + +vi.mock("./windows-task-restart.js", () => ({ + relaunchGatewayScheduledTask: (...args: unknown[]) => relaunchGatewayScheduledTaskMock(...args), +})); + +const originalPlatformDescriptor = Object.getOwnPropertyDescriptor(process, "platform"); + +function setPlatform(platform: NodeJS.Platform): void { + if (!originalPlatformDescriptor) { + return; + } + Object.defineProperty(process, "platform", { + ...originalPlatformDescriptor, + value: platform, + }); +} + +function withoutSigusr1Listeners(fn: () => void): void { + const listeners = process.listeners("SIGUSR1"); + process.removeAllListeners("SIGUSR1"); + try { + fn(); + } finally { + process.removeAllListeners("SIGUSR1"); + for (const listener of listeners) { + process.on("SIGUSR1", listener); + } + } +} + +function withRestartSupervisorEnabled(fn: () => void): void { + const originalVitest = process.env.VITEST; + const originalNodeEnv = process.env.NODE_ENV; + delete process.env.VITEST; + delete process.env.NODE_ENV; + try { + fn(); + } finally { + if (originalVitest === undefined) { + delete process.env.VITEST; + } else { + process.env.VITEST = originalVitest; + } + if (originalNodeEnv === undefined) { + delete process.env.NODE_ENV; + } else { + process.env.NODE_ENV = originalNodeEnv; + } + } +} + describe("infra runtime", () => { function setupRestartSignalSuite() { beforeEach(() => { __testing.resetSigusr1State(); + relaunchGatewayScheduledTaskMock.mockReset(); + relaunchGatewayScheduledTaskMock.mockReturnValue({ ok: true, method: "schtasks" }); + cleanStaleGatewayProcessesSyncMock.mockReset(); + cleanStaleGatewayProcessesSyncMock.mockReturnValue([]); + findGatewayPidsOnPortSyncMock.mockReset(); + findGatewayPidsOnPortSyncMock.mockReturnValue([]); vi.useFakeTimers(); vi.spyOn(process, "kill").mockImplementation(() => true); }); @@ -33,6 +99,9 @@ describe("infra runtime", () => { clearConfigCache(); await vi.runOnlyPendingTimersAsync(); vi.useRealTimers(); + if (originalPlatformDescriptor) { + Object.defineProperty(process, "platform", originalPlatformDescriptor); + } vi.restoreAllMocks(); }); } @@ -80,6 +149,53 @@ describe("infra runtime", () => { } }); + it("uses the SIGUSR1 listener path on Windows when the run loop is active", () => { + setPlatform("win32"); + const emitSpy = vi.spyOn(process, "emit"); + const handler = () => {}; + process.on("SIGUSR1", handler); + try { + expect(emitGatewayRestart()).toBe(true); + expect(emitSpy).toHaveBeenCalledWith("SIGUSR1"); + expect(relaunchGatewayScheduledTaskMock).not.toHaveBeenCalled(); + } finally { + process.removeListener("SIGUSR1", handler); + } + }); + + it("uses the Windows supervisor fallback without leaving a restart cycle in flight", () => { + setPlatform("win32"); + withoutSigusr1Listeners(() => { + withRestartSupervisorEnabled(() => { + relaunchGatewayScheduledTaskMock.mockReturnValueOnce({ ok: true, method: "schtasks" }); + + expect(emitGatewayRestart("windows-fallback")).toBe(true); + + expect(relaunchGatewayScheduledTaskMock).toHaveBeenCalledTimes(1); + expect(consumeGatewaySigusr1RestartAuthorization()).toBe(false); + const next = scheduleGatewaySigusr1Restart({ delayMs: 0, reason: "next" }); + expect(next.coalesced).toBe(false); + expect(next.mode).toBe("supervisor"); + }); + }); + }); + + it("rolls back the Windows supervisor fallback when scheduling fails", () => { + setPlatform("win32"); + withoutSigusr1Listeners(() => { + withRestartSupervisorEnabled(() => { + relaunchGatewayScheduledTaskMock + .mockReturnValueOnce({ ok: false, method: "schtasks", detail: "denied" }) + .mockReturnValueOnce({ ok: true, method: "schtasks" }); + + expect(emitGatewayRestart("windows-fallback")).toBe(false); + expect(consumeGatewaySigusr1RestartAuthorization()).toBe(false); + expect(emitGatewayRestart("windows-retry")).toBe(true); + expect(relaunchGatewayScheduledTaskMock).toHaveBeenCalledTimes(2); + }); + }); + }); + it("coalesces duplicate scheduled restarts into a single pending timer", async () => { const emitSpy = vi.spyOn(process, "emit"); const handler = () => {}; diff --git a/src/infra/restart.ts b/src/infra/restart.ts index bea0bcaab1b..5b32de994a9 100644 --- a/src/infra/restart.ts +++ b/src/infra/restart.ts @@ -265,14 +265,28 @@ export function emitGatewayRestart(reasonOverride?: string): boolean { authorizeGatewaySigusr1Restart(); try { if (process.listenerCount("SIGUSR1") > 0) { + // Signal path: let the run-loop's SIGUSR1 handler drive restart. + // Works on all platforms including Windows when a listener is registered. process.emit("SIGUSR1"); + } else if (process.platform === "win32") { + // On Windows with no SIGUSR1 listener, fall back to task-scheduler handoff. + // triggerOpenClawRestart() uses schtasks to restart the gateway. + const result = triggerOpenClawRestart(); + if (!result.ok) { + // Roll back the cycle marker so future restart requests can still proceed. + rollBackGatewayRestartEmission(); + restartLog.warn("Windows scheduled task restart failed, token rolled back"); + return false; + } + consumeGatewaySigusr1RestartAuthorization(); + markGatewaySigusr1RestartHandled(); } else { + // Unix without listener: send signal directly. process.kill(process.pid, "SIGUSR1"); } } catch { // Roll back the cycle marker so future restart requests can still proceed. - emittedRestartToken = consumedRestartToken; - emittedRestartReason = undefined; + rollBackGatewayRestartEmission(); return false; } lastRestartEmittedAt = Date.now(); @@ -335,6 +349,12 @@ export function markGatewaySigusr1RestartHandled(): void { } } +function rollBackGatewayRestartEmission(): void { + emittedRestartToken = consumedRestartToken; + emittedRestartReason = undefined; + consumeGatewaySigusr1RestartAuthorization(); +} + export type RestartDeferralHooks = { onDeferring?: (pending: number) => void; onStillPending?: (pending: number, elapsedMs: number) => void; @@ -617,7 +637,7 @@ export type ScheduledRestart = { signal: "SIGUSR1"; delayMs: number; reason?: string; - mode: "emit" | "signal"; + mode: "emit" | "signal" | "supervisor"; coalesced: boolean; cooldownMsApplied: number; }; @@ -637,7 +657,8 @@ export function scheduleGatewaySigusr1Restart(opts?: { typeof opts?.reason === "string" && opts.reason.trim() ? opts.reason.trim().slice(0, 200) : undefined; - const mode = process.listenerCount("SIGUSR1") > 0 ? "emit" : "signal"; + const hasSigusr1Listener = process.listenerCount("SIGUSR1") > 0; + const mode = hasSigusr1Listener ? "emit" : process.platform === "win32" ? "supervisor" : "signal"; const nowMs = Date.now(); const cooldownMsApplied = Math.max(0, lastRestartEmittedAt + RESTART_COOLDOWN_MS - nowMs); const requestedDueAt = nowMs + delayMs + cooldownMsApplied;