fix: add gateway supervisor restart handoff

This commit is contained in:
Shakker
2026-05-05 05:31:47 +01:00
parent f9da484365
commit acb0acd8dd
5 changed files with 627 additions and 5 deletions

View File

@@ -19,6 +19,7 @@ export {
resetGatewayRestartStateForInProcessRestart,
scheduleGatewaySigusr1Restart,
} from "../../infra/restart.js";
export { writeGatewayRestartHandoffSync } from "../../infra/restart-handoff.js";
export { markUpdateRestartSentinelFailure } from "../../infra/restart-sentinel.js";
export { detectRespawnSupervisor } from "../../infra/supervisor-markers.js";
export { writeDiagnosticStabilityBundleForFailureSync } from "../../logging/diagnostic-stability-bundle.js";

View File

@@ -14,6 +14,17 @@ const isGatewaySigusr1RestartExternallyAllowed = vi.fn(() => false);
const markGatewaySigusr1RestartHandled = vi.fn();
const peekGatewaySigusr1RestartReason = vi.fn<() => string | undefined>(() => undefined);
const resetGatewayRestartStateForInProcessRestart = vi.fn();
// Spy standing in for the restart-handoff writer. It ignores its options and
// returns a canned version-1 handoff record carrying the current pid and an
// expiry 60 seconds after the moment it is built.
const writeGatewayRestartHandoffSync = vi.fn((_opts: unknown) => {
  const cannedHandoff = {
    kind: "gateway-supervisor-restart-handoff" as const,
    version: 1 as const,
    intentId: "test-intent",
    pid: process.pid,
    createdAt: Date.now(),
    expiresAt: Date.now() + 60_000,
    source: "unknown" as const,
    restartKind: "full-process" as const,
    supervisorMode: "external" as const,
  };
  return cannedHandoff;
});
const scheduleGatewaySigusr1Restart = vi.fn((_opts?: { delayMs?: number; reason?: string }) => ({
ok: true,
pid: process.pid,
@@ -107,6 +118,10 @@ vi.mock("../../infra/restart-sentinel.js", () => ({
markUpdateRestartSentinelFailure: (reason: string) => markUpdateRestartSentinelFailure(reason),
}));
// Replace the restart-handoff module with a stub whose single export forwards
// to the shared writeGatewayRestartHandoffSync spy. The arrow wrapper resolves
// the spy only when it is called, not when the mock factory itself runs.
vi.mock("../../infra/restart-handoff.js", () => ({
writeGatewayRestartHandoffSync: (opts: unknown) => writeGatewayRestartHandoffSync(opts),
}));
vi.mock("../../process/command-queue.js", () => ({
getActiveTaskCount: () => getActiveTaskCount(),
markGatewayDraining: () => markGatewayDraining(),
@@ -595,6 +610,7 @@ describe("runGatewayLoop", () => {
expect(lockRelease).toHaveBeenCalled();
expect(runtime.exit).toHaveBeenCalledWith(0);
expect(exitCallOrder).toEqual(["lockRelease", "exit"]);
expect(writeGatewayRestartHandoffSync).not.toHaveBeenCalled();
});
});
@@ -616,6 +632,12 @@ describe("runGatewayLoop", () => {
sigusr1();
await expect(exited).resolves.toBe(0);
expect(runtime.exit).toHaveBeenCalledWith(0);
expect(writeGatewayRestartHandoffSync).toHaveBeenCalledWith({
restartKind: "full-process",
reason: undefined,
processInstanceId: expect.any(String),
supervisorMode: "launchd",
});
expect(Date.now() - startedAt).toBeGreaterThanOrEqual(1400);
});
} finally {
@@ -719,9 +741,40 @@ describe("runGatewayLoop", () => {
expect(respawnGatewayProcessForUpdate).toHaveBeenCalledTimes(1);
expect(start).toHaveBeenCalledTimes(1);
expect(markUpdateRestartSentinelFailure).not.toHaveBeenCalled();
expect(writeGatewayRestartHandoffSync).not.toHaveBeenCalled();
});
});
// Verifies that when an update-triggered SIGUSR1 restart is handed to a
// supervisor, the loop records a restart handoff and then exits with code 0.
it("writes a handoff before exiting for supervised update restarts", async () => {
// Drop call history from earlier tests so the handoff assertion only sees this run.
vi.clearAllMocks();
// Make the pending restart look like it was requested by an update run.
peekGatewaySigusr1RestartReason.mockReturnValue("update.run");
// The update respawn reports that a supervisor will relaunch the process.
respawnGatewayProcessForUpdate.mockReturnValueOnce({
mode: "supervised",
});
try {
// NOTE(review): presumably no supervisor is detected on this platform, so the
// loop falls back to the "external" supervisor mode asserted below — confirm
// against the detectRespawnSupervisor mock.
setPlatform("freebsd");
await withIsolatedSignals(async ({ captureSignal }) => {
const { runtime, exited } = await createSignaledLoopHarness();
// Grab the registered SIGUSR1 handler and fire it to trigger the restart path.
const sigusr1 = captureSignal("SIGUSR1");
sigusr1();
await expect(exited).resolves.toBe(0);
expect(runtime.exit).toHaveBeenCalledWith(0);
// The handoff must carry the update restart kind, the peeked reason, and a
// per-process instance id (any string is accepted here).
expect(writeGatewayRestartHandoffSync).toHaveBeenCalledWith({
restartKind: "update-process",
reason: "update.run",
processInstanceId: expect.any(String),
supervisorMode: "external",
});
});
} finally {
// Put back the original process.platform descriptor if one was captured.
if (originalPlatformDescriptor) {
Object.defineProperty(process, "platform", originalPlatformDescriptor);
}
}
});
it("probes the configured gateway host for update respawn health", async () => {
vi.clearAllMocks();
peekGatewaySigusr1RestartReason.mockReturnValue("update.run");

View File

@@ -1,3 +1,4 @@
import { randomUUID } from "node:crypto";
import net from "node:net";
import type { startGatewayServer } from "../../gateway/server.js";
import { formatErrorMessage } from "../../infra/errors.js";
@@ -94,6 +95,7 @@ export async function runGatewayLoop(params: {
let server: Awaited<ReturnType<typeof startGatewayServer>> | null = null;
let shuttingDown = false;
let restartResolver: (() => void) | null = null;
const processInstanceId = randomUUID();
const waitForHealthyChild = params.waitForHealthyChild ?? waitForHealthyGatewayChild;
const cleanupSignals = () => {
@@ -140,6 +142,7 @@ export async function runGatewayLoop(params: {
markUpdateRestartSentinelFailure,
respawnGatewayProcessForUpdate,
restartGatewayProcessWithFreshPid,
writeGatewayRestartHandoffSync,
} = await loadGatewayLifecycleRuntimeModule();
if (isUpdateRestart) {
@@ -176,8 +179,15 @@ export async function runGatewayLoop(params: {
return;
}
if (respawn.mode === "supervised") {
const supervisorMode = detectRespawnSupervisor(process.env, process.platform);
writeGatewayRestartHandoffSync({
restartKind: "update-process",
reason: restartReason,
processInstanceId,
supervisorMode: supervisorMode ?? "external",
});
gatewayLog.info("restart mode: update process respawn (supervisor restart)");
if (detectRespawnSupervisor(process.env, process.platform) === "launchd") {
if (supervisorMode === "launchd") {
await new Promise((resolve) => {
setTimeout(resolve, LAUNCHD_SUPERVISED_RESTART_EXIT_DELAY_MS);
});
@@ -208,15 +218,24 @@ export async function runGatewayLoop(params: {
// Release the lock BEFORE spawning so the child can acquire it immediately.
const respawn = restartGatewayProcessWithFreshPid();
if (respawn.mode === "spawned" || respawn.mode === "supervised") {
const supervisorMode =
respawn.mode === "supervised"
? detectRespawnSupervisor(process.env, process.platform)
: null;
const modeLabel =
respawn.mode === "spawned"
? `spawned pid ${respawn.pid ?? "unknown"}`
: "supervisor restart";
if (respawn.mode === "supervised") {
writeGatewayRestartHandoffSync({
restartKind: "full-process",
reason: restartReason,
processInstanceId,
supervisorMode: supervisorMode ?? "external",
});
}
gatewayLog.info(`restart mode: full process restart (${modeLabel})`);
if (
respawn.mode === "supervised" &&
detectRespawnSupervisor(process.env, process.platform) === "launchd"
) {
if (supervisorMode === "launchd") {
// A short clean-exit pause keeps rapid SIGUSR1/config restarts from
// tripping launchd crash-loop throttling before KeepAlive relaunches.
await new Promise((resolve) => {