mirror of
https://github.com/openclaw/openclaw.git
synced 2026-05-09 07:00:44 +00:00
fix: add gateway supervisor restart handoff
This commit is contained in:
@@ -19,6 +19,7 @@ export {
|
||||
resetGatewayRestartStateForInProcessRestart,
|
||||
scheduleGatewaySigusr1Restart,
|
||||
} from "../../infra/restart.js";
|
||||
export { writeGatewayRestartHandoffSync } from "../../infra/restart-handoff.js";
|
||||
export { markUpdateRestartSentinelFailure } from "../../infra/restart-sentinel.js";
|
||||
export { detectRespawnSupervisor } from "../../infra/supervisor-markers.js";
|
||||
export { writeDiagnosticStabilityBundleForFailureSync } from "../../logging/diagnostic-stability-bundle.js";
|
||||
|
||||
@@ -14,6 +14,17 @@ const isGatewaySigusr1RestartExternallyAllowed = vi.fn(() => false);
|
||||
const markGatewaySigusr1RestartHandled = vi.fn();
|
||||
const peekGatewaySigusr1RestartReason = vi.fn<() => string | undefined>(() => undefined);
|
||||
const resetGatewayRestartStateForInProcessRestart = vi.fn();
|
||||
// Test double for the restart-handoff writer: it ignores the options it is
// given and returns a fixed handoff-shaped record on every call.
const writeGatewayRestartHandoffSync = vi.fn((_opts: unknown) => ({
|
||||
kind: "gateway-supervisor-restart-handoff" as const,
|
||||
version: 1 as const,
|
||||
intentId: "test-intent",
|
||||
pid: process.pid,
|
||||
createdAt: Date.now(),
|
||||
// Expiry is stamped 60 seconds after the moment the mock is invoked.
expiresAt: Date.now() + 60_000,
|
||||
source: "unknown" as const,
|
||||
restartKind: "full-process" as const,
|
||||
supervisorMode: "external" as const,
|
||||
}));
|
||||
const scheduleGatewaySigusr1Restart = vi.fn((_opts?: { delayMs?: number; reason?: string }) => ({
|
||||
ok: true,
|
||||
pid: process.pid,
|
||||
@@ -107,6 +118,10 @@ vi.mock("../../infra/restart-sentinel.js", () => ({
|
||||
markUpdateRestartSentinelFailure: (reason: string) => markUpdateRestartSentinelFailure(reason),
|
||||
}));
|
||||
|
||||
// Replace the restart-handoff module with a thin forwarder to the
// writeGatewayRestartHandoffSync mock so tests can assert on its calls.
vi.mock("../../infra/restart-handoff.js", () => ({
|
||||
writeGatewayRestartHandoffSync: (opts: unknown) => writeGatewayRestartHandoffSync(opts),
|
||||
}));
|
||||
|
||||
vi.mock("../../process/command-queue.js", () => ({
|
||||
getActiveTaskCount: () => getActiveTaskCount(),
|
||||
markGatewayDraining: () => markGatewayDraining(),
|
||||
@@ -595,6 +610,7 @@ describe("runGatewayLoop", () => {
|
||||
expect(lockRelease).toHaveBeenCalled();
|
||||
expect(runtime.exit).toHaveBeenCalledWith(0);
|
||||
expect(exitCallOrder).toEqual(["lockRelease", "exit"]);
|
||||
expect(writeGatewayRestartHandoffSync).not.toHaveBeenCalled();
|
||||
});
|
||||
});
|
||||
|
||||
@@ -616,6 +632,12 @@ describe("runGatewayLoop", () => {
|
||||
sigusr1();
|
||||
await expect(exited).resolves.toBe(0);
|
||||
expect(runtime.exit).toHaveBeenCalledWith(0);
|
||||
expect(writeGatewayRestartHandoffSync).toHaveBeenCalledWith({
|
||||
restartKind: "full-process",
|
||||
reason: undefined,
|
||||
processInstanceId: expect.any(String),
|
||||
supervisorMode: "launchd",
|
||||
});
|
||||
expect(Date.now() - startedAt).toBeGreaterThanOrEqual(1400);
|
||||
});
|
||||
} finally {
|
||||
@@ -719,9 +741,40 @@ describe("runGatewayLoop", () => {
|
||||
expect(respawnGatewayProcessForUpdate).toHaveBeenCalledTimes(1);
|
||||
expect(start).toHaveBeenCalledTimes(1);
|
||||
expect(markUpdateRestartSentinelFailure).not.toHaveBeenCalled();
|
||||
expect(writeGatewayRestartHandoffSync).not.toHaveBeenCalled();
|
||||
});
|
||||
});
|
||||
|
||||
// Verifies that a SIGUSR1 restart with reason "update.run", whose update
// respawn reports mode "supervised", writes an "update-process" handoff and
// exits with code 0.
it("writes a handoff before exiting for supervised update restarts", async () => {
|
||||
vi.clearAllMocks();
|
||||
// Report "update.run" as the pending restart reason.
peekGatewaySigusr1RestartReason.mockReturnValue("update.run");
|
||||
// Make the update respawn report a supervisor-managed restart.
respawnGatewayProcessForUpdate.mockReturnValueOnce({
|
||||
mode: "supervised",
|
||||
});
|
||||
try {
|
||||
// NOTE(review): presumably chosen as a platform where no specific supervisor
// is detected, so the "external" fallback applies; confirm against
// detectRespawnSupervisor.
setPlatform("freebsd");
|
||||
await withIsolatedSignals(async ({ captureSignal }) => {
|
||||
const { runtime, exited } = await createSignaledLoopHarness();
|
||||
const sigusr1 = captureSignal("SIGUSR1");
|
||||

||||
// Invoke the captured SIGUSR1 handler to trigger the restart.
sigusr1();
|
||||

||||
await expect(exited).resolves.toBe(0);
|
||||
expect(runtime.exit).toHaveBeenCalledWith(0);
|
||||
// The handoff must carry the update restart kind, the restart reason, a
// string process instance id, and the "external" supervisor mode.
expect(writeGatewayRestartHandoffSync).toHaveBeenCalledWith({
|
||||
restartKind: "update-process",
|
||||
reason: "update.run",
|
||||
processInstanceId: expect.any(String),
|
||||
supervisorMode: "external",
|
||||
});
|
||||
});
|
||||
} finally {
|
||||
// Restore the saved process.platform descriptor (if one was captured) so the
// platform override does not leak out of this test.
if (originalPlatformDescriptor) {
|
||||
Object.defineProperty(process, "platform", originalPlatformDescriptor);
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
it("probes the configured gateway host for update respawn health", async () => {
|
||||
vi.clearAllMocks();
|
||||
peekGatewaySigusr1RestartReason.mockReturnValue("update.run");
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
import { randomUUID } from "node:crypto";
|
||||
import net from "node:net";
|
||||
import type { startGatewayServer } from "../../gateway/server.js";
|
||||
import { formatErrorMessage } from "../../infra/errors.js";
|
||||
@@ -94,6 +95,7 @@ export async function runGatewayLoop(params: {
|
||||
let server: Awaited<ReturnType<typeof startGatewayServer>> | null = null;
|
||||
let shuttingDown = false;
|
||||
let restartResolver: (() => void) | null = null;
|
||||
const processInstanceId = randomUUID();
|
||||
const waitForHealthyChild = params.waitForHealthyChild ?? waitForHealthyGatewayChild;
|
||||
|
||||
const cleanupSignals = () => {
|
||||
@@ -140,6 +142,7 @@ export async function runGatewayLoop(params: {
|
||||
markUpdateRestartSentinelFailure,
|
||||
respawnGatewayProcessForUpdate,
|
||||
restartGatewayProcessWithFreshPid,
|
||||
writeGatewayRestartHandoffSync,
|
||||
} = await loadGatewayLifecycleRuntimeModule();
|
||||
|
||||
if (isUpdateRestart) {
|
||||
@@ -176,8 +179,15 @@ export async function runGatewayLoop(params: {
|
||||
return;
|
||||
}
|
||||
if (respawn.mode === "supervised") {
|
||||
const supervisorMode = detectRespawnSupervisor(process.env, process.platform);
|
||||
writeGatewayRestartHandoffSync({
|
||||
restartKind: "update-process",
|
||||
reason: restartReason,
|
||||
processInstanceId,
|
||||
supervisorMode: supervisorMode ?? "external",
|
||||
});
|
||||
gatewayLog.info("restart mode: update process respawn (supervisor restart)");
|
||||
if (detectRespawnSupervisor(process.env, process.platform) === "launchd") {
|
||||
if (supervisorMode === "launchd") {
|
||||
await new Promise((resolve) => {
|
||||
setTimeout(resolve, LAUNCHD_SUPERVISED_RESTART_EXIT_DELAY_MS);
|
||||
});
|
||||
@@ -208,15 +218,24 @@ export async function runGatewayLoop(params: {
|
||||
// Release the lock BEFORE spawning so the child can acquire it immediately.
|
||||
const respawn = restartGatewayProcessWithFreshPid();
|
||||
if (respawn.mode === "spawned" || respawn.mode === "supervised") {
|
||||
const supervisorMode =
|
||||
respawn.mode === "supervised"
|
||||
? detectRespawnSupervisor(process.env, process.platform)
|
||||
: null;
|
||||
const modeLabel =
|
||||
respawn.mode === "spawned"
|
||||
? `spawned pid ${respawn.pid ?? "unknown"}`
|
||||
: "supervisor restart";
|
||||
if (respawn.mode === "supervised") {
|
||||
writeGatewayRestartHandoffSync({
|
||||
restartKind: "full-process",
|
||||
reason: restartReason,
|
||||
processInstanceId,
|
||||
supervisorMode: supervisorMode ?? "external",
|
||||
});
|
||||
}
|
||||
gatewayLog.info(`restart mode: full process restart (${modeLabel})`);
|
||||
if (
|
||||
respawn.mode === "supervised" &&
|
||||
detectRespawnSupervisor(process.env, process.platform) === "launchd"
|
||||
) {
|
||||
if (supervisorMode === "launchd") {
|
||||
// A short clean-exit pause keeps rapid SIGUSR1/config restarts from
|
||||
// tripping launchd crash-loop throttling before KeepAlive relaunches.
|
||||
await new Promise((resolve) => {
|
||||
|
||||
Reference in New Issue
Block a user