fix(gateway): expose restart drain controls

This commit is contained in:
Vincent Koc
2026-05-02 14:43:53 -07:00
parent 624eaf5d4a
commit f6f8d74419
15 changed files with 540 additions and 41 deletions

View File

@@ -326,6 +326,27 @@ describe("runServiceRestart token drift", () => {
expect(service.restart).toHaveBeenCalledTimes(1);
});
// A caller-supplied restart intent must be written for the pid reported by the service runtime.
it("writes restart force and wait options into the service-manager intent", async () => {
  const gatewayPid = 1234;
  const waitMs = 2_500;
  service.readRuntime.mockResolvedValue({ status: "running", pid: gatewayPid });

  const runArgs = {
    ...createServiceRunArgs(),
    opts: {
      json: true,
      restartIntent: { waitMs },
    },
  };
  await runServiceRestart(runArgs);

  // The expectation uses a fresh object so it does not share a reference with the input.
  const expectedWrite = {
    targetPid: gatewayPid,
    intent: { waitMs },
  };
  expect(writeGatewayRestartIntentSync).toHaveBeenCalledWith(expectedWrite);
});
it("clears restart intent when service-manager restart fails before signaling", async () => {
service.readRuntime.mockResolvedValue({ status: "running", pid: 1234 });
writeGatewayRestartIntentSync.mockReturnValueOnce(true);

View File

@@ -13,6 +13,7 @@ import { isSystemdUserServiceAvailable } from "../../daemon/systemd.js";
import { isGatewaySecretRefUnavailableError } from "../../gateway/credentials.js";
import {
clearGatewayRestartIntentSync,
type GatewayRestartIntent,
writeGatewayRestartIntentSync,
} from "../../infra/restart.js";
import { isWSL } from "../../infra/wsl.js";
@@ -28,6 +29,9 @@ import { filterContainerGenericHints } from "./shared.js";
// Options accepted by the lifecycle actions in this module (e.g. runServiceRestart via `params.opts`).
type DaemonLifecycleOptions = {
// When truthy, the action context is created in JSON output mode.
json?: boolean;
// NOTE(review): presumably the raw `--force` CLI flag; not read in the visible part of this module.
force?: boolean;
// NOTE(review): presumably the raw `--wait` duration string; not read in the visible part of this module.
wait?: string;
// Restart intent; when present, runServiceRestart passes it as `intent` to
// writeGatewayRestartIntentSync together with the running service pid.
restartIntent?: GatewayRestartIntent;
};
type RestartPostCheckContext = {
@@ -440,6 +444,7 @@ export async function runServiceRestart(params: {
const json = Boolean(params.opts?.json);
const { stdout, emit, fail } = createDaemonActionContext({ action: "restart", json });
const warnings: string[] = [];
const restartIntent = params.opts?.restartIntent;
let handledRecovery: ServiceRecoveryResult | null = null;
let recoveredLoadedState: boolean | null = null;
const emitScheduledRestart = (
@@ -552,6 +557,7 @@ export async function runServiceRestart(params: {
const runtime = await params.service.readRuntime(process.env).catch(() => null);
wroteRestartIntent = writeGatewayRestartIntentSync({
targetPid: runtime?.pid,
...(restartIntent ? { intent: restartIntent } : {}),
});
}
try {

View File

@@ -7,10 +7,12 @@ import {
formatGatewayPidList,
signalVerifiedGatewayPidSync,
} from "../../infra/gateway-processes.js";
import { type GatewayRestartIntent, writeGatewayRestartIntentSync } from "../../infra/restart.js";
import { defaultRuntime } from "../../runtime.js";
import { normalizeOptionalString } from "../../shared/string-coerce.js";
import { theme } from "../../terminal/theme.js";
import { formatCliCommand } from "../command-format.js";
import { parseDurationMs } from "../parse-duration.js";
import { recoverInstalledLaunchAgent } from "./launchd-recovery.js";
import {
runServiceRestart,
@@ -122,7 +124,25 @@ async function stopGatewayWithoutServiceManager(port: number) {
};
}
async function restartGatewayWithoutServiceManager(port: number) {
/**
 * Translates the restart CLI options into a restart intent.
 *
 * @param opts - Lifecycle options; only `force` and `wait` are consulted.
 * @returns `{ force: true }` when force is set, `{ waitMs }` when a wait duration
 *   is given, or `undefined` when neither option was supplied.
 * @throws Error when `force` and `wait` are both supplied.
 */
function resolveGatewayRestartIntentOptions(
  opts: DaemonLifecycleOptions,
): GatewayRestartIntent | undefined {
  const waitRequested = opts.wait !== undefined;

  if (!opts.force) {
    // No force: either honor the requested wait duration or leave the intent unset.
    return waitRequested ? { waitMs: parseDurationMs(String(opts.wait)) } : undefined;
  }

  // Force is set from here on; a wait duration would contradict it.
  if (waitRequested) {
    throw new Error("--force cannot be combined with --wait");
  }
  return { force: true };
}
async function restartGatewayWithoutServiceManager(
port: number,
restartIntent?: GatewayRestartIntent,
) {
await assertUnmanagedGatewayRestartEnabled(port);
const pids = resolveVerifiedGatewayListenerPids(port);
if (pids.length === 0) {
@@ -133,6 +153,10 @@ async function restartGatewayWithoutServiceManager(port: number) {
`multiple gateway processes are listening on port ${port}: ${formatGatewayPidList(pids)}; use "openclaw gateway status --deep" before retrying restart`,
);
}
writeGatewayRestartIntentSync({
targetPid: pids[0],
...(restartIntent ? { intent: restartIntent } : {}),
});
signalVerifiedGatewayPidSync(pids[0], "SIGUSR1");
return {
result: "restarted" as const,
@@ -197,6 +221,7 @@ export async function runDaemonRestart(opts: DaemonLifecycleOptions = {}): Promi
const json = Boolean(opts.json);
const service = resolveGatewayService();
let restartedWithoutServiceManager = false;
const restartIntent = resolveGatewayRestartIntentOptions(opts);
const restartPort = await resolveGatewayLifecyclePort(service).catch(() =>
resolveGatewayPortFallback(),
);
@@ -208,7 +233,10 @@ export async function runDaemonRestart(opts: DaemonLifecycleOptions = {}): Promi
serviceNoun: "Gateway",
service,
renderStartHints: renderGatewayServiceStartHints,
opts,
opts: {
...opts,
...(restartIntent ? { restartIntent } : {}),
},
checkTokenDrift: true,
onNotLoaded: async () => {
if (process.platform === "darwin") {
@@ -217,7 +245,7 @@ export async function runDaemonRestart(opts: DaemonLifecycleOptions = {}): Promi
return recovered;
}
}
const handled = await restartGatewayWithoutServiceManager(restartPort);
const handled = await restartGatewayWithoutServiceManager(restartPort, restartIntent);
if (handled) {
restartedWithoutServiceManager = true;
return handled;

View File

@@ -59,6 +59,28 @@ describe("addGatewayServiceCommands", () => {
);
},
},
{
  // `--wait <duration>` must reach runDaemonRestart as the raw string from argv.
  // Named for what it asserts: the `--force` flag is covered by the next case.
  name: "forwards restart wait control",
  argv: ["restart", "--wait", "30s"],
  assert: () => {
    expect(runDaemonRestart).toHaveBeenCalledWith(
      expect.objectContaining({
        wait: "30s",
      }),
    );
  },
},
{
  // `--force` must reach runDaemonRestart as a boolean `true`.
  name: "forwards restart force control",
  argv: ["restart", "--force"],
  assert: () => {
    expect(runDaemonRestart).toHaveBeenCalledWith(
      expect.objectContaining({
        force: true,
      }),
    );
  },
},
{
name: "forwards status auth collisions from parent gateway command",
argv: ["status", "--token", "tok_status", "--password", "pw_status"],

View File

@@ -1,7 +1,7 @@
import type { Command } from "commander";
import { createLazyImportLoader } from "../../shared/lazy-promise.js";
import { inheritOptionFromParent } from "../command-options.js";
import type { DaemonInstallOptions, GatewayRpcOpts } from "./types.js";
import type { DaemonInstallOptions, DaemonLifecycleOptions, GatewayRpcOpts } from "./types.js";
const daemonInstallModuleLoader = createLazyImportLoader(() => import("./install.runtime.js"));
const daemonLifecycleModuleLoader = createLazyImportLoader(() => import("./lifecycle.runtime.js"));
@@ -44,6 +44,14 @@ function resolveRpcOptions(cmdOpts: GatewayRpcOpts, command?: Command): GatewayR
};
}
/**
 * Builds the options handed to the restart action.
 *
 * `force` ends up `true` when it is set either on the restart command itself or
 * on the parent command (looked up via `inheritOptionFromParent`); all other
 * options are copied through unchanged.
 */
function resolveRestartOptions(cmdOpts: DaemonLifecycleOptions, command?: Command) {
  // The parent lookup always runs, regardless of the local flag.
  const inheritedForce = inheritOptionFromParent<boolean>(command, "force");
  const force = cmdOpts.force ? true : Boolean(inheritedForce);
  return { ...cmdOpts, force };
}
export function addGatewayServiceCommands(parent: Command, opts?: { statusDescription?: string }) {
parent
.command("status")
@@ -113,9 +121,14 @@ export function addGatewayServiceCommands(parent: Command, opts?: { statusDescri
parent
.command("restart")
.description("Restart the Gateway service (launchd/systemd/schtasks)")
.option("--force", "Restart immediately without waiting for active gateway work", false)
.option(
"--wait <duration>",
"Wait duration before forcing restart (ms, 10s, 5m; 0 waits indefinitely)",
)
.option("--json", "Output JSON", false)
.action(async (cmdOpts) => {
.action(async (cmdOpts, command) => {
const { runDaemonRestart } = await loadDaemonLifecycleModule();
await runDaemonRestart(cmdOpts);
await runDaemonRestart(resolveRestartOptions(cmdOpts, command));
});
}

View File

@@ -26,4 +26,6 @@ export type DaemonInstallOptions = {
/** Options passed from the gateway service CLI commands to the daemon lifecycle runners. */
export type DaemonLifecycleOptions = {
/** Request JSON output (`--json`). */
json?: boolean;
/** Restart flag (`--force`); the restart command describes it as restarting without waiting for active gateway work. */
force?: boolean;
/** Raw `--wait <duration>` value as typed on the command line (e.g. "30s"); not yet parsed. */
wait?: string;
};

View File

@@ -10,6 +10,7 @@ export {
} from "../../infra/process-respawn.js";
export {
resolveGatewayRestartDeferralTimeoutMs,
consumeGatewayRestartIntentPayloadSync,
consumeGatewayRestartIntentSync,
consumeGatewaySigusr1RestartAuthorization,
isGatewaySigusr1RestartExternallyAllowed,
@@ -27,4 +28,5 @@ export {
resetAllLanes,
waitForActiveTasks,
} from "../../process/command-queue.js";
export { getInspectableActiveTaskRestartBlockers } from "../../tasks/task-registry.maintenance.js";
export { reloadTaskRegistryFromStore } from "../../tasks/runtime-internal.js";

View File

@@ -5,6 +5,9 @@ import { pickBeaconHost, pickGatewayPort } from "./discover.js";
const acquireGatewayLock = vi.fn(async (_opts?: { port?: number }) => ({
release: vi.fn(async () => {}),
}));
// Mock for consuming the restart intent payload. It returns null by default;
// individual tests override it with mockReturnValueOnce to supply a payload.
const consumeGatewayRestartIntentPayloadSync = vi.fn<
() => { force?: boolean; waitMs?: number } | null
>(() => null);
const consumeGatewaySigusr1RestartAuthorization = vi.fn(() => true);
const consumeGatewayRestartIntentSync = vi.fn(() => false);
const isGatewaySigusr1RestartExternallyAllowed = vi.fn(() => false);
@@ -21,6 +24,17 @@ const scheduleGatewaySigusr1Restart = vi.fn((_opts?: { delayMs?: number; reason?
cooldownMsApplied: 0,
}));
const getActiveTaskCount = vi.fn(() => 0);
// Mock for the task-registry blocker lookup; reports no blockers unless a test overrides it.
const getInspectableActiveTaskRestartBlockers = vi.fn(
  (): Array<{
    taskId: string;
    status: "queued" | "running";
    runtime: "subagent" | "acp" | "cli" | "cron";
    runId?: string;
    label?: string;
    title?: string;
  }> => [],
);
const markGatewayDraining = vi.fn();
const waitForActiveTasks = vi.fn(async (_timeoutMs?: number) => ({ drained: true }));
const resetAllLanes = vi.fn();
@@ -64,6 +78,7 @@ vi.mock("../../infra/gateway-lock.js", () => ({
}));
vi.mock("../../infra/restart.js", () => ({
consumeGatewayRestartIntentPayloadSync: () => consumeGatewayRestartIntentPayloadSync(),
consumeGatewaySigusr1RestartAuthorization: () => consumeGatewaySigusr1RestartAuthorization(),
consumeGatewayRestartIntentSync: () => consumeGatewayRestartIntentSync(),
isGatewaySigusr1RestartExternallyAllowed: () => isGatewaySigusr1RestartExternallyAllowed(),
@@ -103,6 +118,10 @@ vi.mock("../../tasks/runtime-internal.js", () => ({
reloadTaskRegistryFromStore: () => reloadTaskRegistryFromStore(),
}));
vi.mock("../../tasks/task-registry.maintenance.js", () => ({
getInspectableActiveTaskRestartBlockers: () => getInspectableActiveTaskRestartBlockers(),
}));
vi.mock("../../agents/pi-embedded-runner/runs.js", () => ({
abortEmbeddedPiRun: (sessionId?: string, opts?: { mode?: "all" | "compacting" }) =>
abortEmbeddedPiRun(sessionId, opts),
@@ -270,7 +289,7 @@ describe("runGatewayLoop", () => {
it("treats SIGTERM with a restart intent as a draining restart", async () => {
vi.clearAllMocks();
consumeGatewayRestartIntentSync.mockReturnValueOnce(true);
consumeGatewayRestartIntentPayloadSync.mockReturnValueOnce({});
getActiveTaskCount.mockReturnValueOnce(1).mockReturnValue(0);
await withIsolatedSignals(async ({ captureSignal }) => {
@@ -301,7 +320,7 @@ describe("runGatewayLoop", () => {
await new Promise<void>((resolve) => setImmediate(resolve));
await new Promise<void>((resolve) => setImmediate(resolve));
expect(consumeGatewayRestartIntentSync).toHaveBeenCalledOnce();
expect(consumeGatewayRestartIntentPayloadSync).toHaveBeenCalledOnce();
expect(markGatewayDraining).toHaveBeenCalledOnce();
expect(waitForActiveTasks).toHaveBeenCalledWith(90_000);
expect(closeFirst).toHaveBeenCalledWith({
@@ -321,6 +340,68 @@ describe("runGatewayLoop", () => {
});
});
// A restart intent carrying `waitMs` must be used as the drain timeout for a SIGTERM restart.
it("uses restart intent wait overrides for SIGTERM drain", async () => {
vi.clearAllMocks();
// The consumed intent payload asks for a 2.5s wait.
consumeGatewayRestartIntentPayloadSync.mockReturnValueOnce({ waitMs: 2_500 });
// One active task on the first count, none afterwards.
getActiveTaskCount.mockReturnValueOnce(1).mockReturnValue(0);
await withIsolatedSignals(async ({ captureSignal }) => {
const { start, exited } = await createSignaledLoopHarness();
const sigterm = captureSignal("SIGTERM");
const sigint = captureSignal("SIGINT");
sigterm();
// NOTE(review): presumably two event-loop turns are needed for the async SIGTERM handler to finish — confirm.
await new Promise<void>((resolve) => setImmediate(resolve));
await new Promise<void>((resolve) => setImmediate(resolve));
// The drain wait receives the intent's waitMs value.
expect(waitForActiveTasks).toHaveBeenCalledWith(2_500);
// NOTE(review): presumably the initial start plus one restart — confirm against the harness.
expect(start).toHaveBeenCalledTimes(2);
// SIGINT ends the loop so the test can observe the exit code.
sigint();
await expect(exited).resolves.toBe(0);
});
});
// A restart intent with `force: true` must skip the drain waits, abort embedded runs,
// and still log the active task blockers.
it("forces SIGTERM restarts without waiting for active task drain", async () => {
vi.clearAllMocks();
// The consumed intent payload requests a forced restart.
consumeGatewayRestartIntentPayloadSync.mockReturnValueOnce({ force: true });
// One active task and one active embedded run on the first count, none afterwards.
getActiveTaskCount.mockReturnValueOnce(1).mockReturnValue(0);
getActiveEmbeddedRunCount.mockReturnValueOnce(1).mockReturnValue(0);
// A single inspectable blocker whose taskId is expected in the warning below.
getInspectableActiveTaskRestartBlockers.mockReturnValueOnce([
{
taskId: "task-force",
runId: "run-force",
status: "running",
runtime: "cron",
label: "forced",
},
]);
await withIsolatedSignals(async ({ captureSignal }) => {
const { start, exited } = await createSignaledLoopHarness();
const sigterm = captureSignal("SIGTERM");
const sigint = captureSignal("SIGINT");
sigterm();
// NOTE(review): presumably two event-loop turns are needed for the async SIGTERM handler to finish — confirm.
await new Promise<void>((resolve) => setImmediate(resolve));
await new Promise<void>((resolve) => setImmediate(resolve));
// Neither drain wait may run in force mode.
expect(waitForActiveTasks).not.toHaveBeenCalled();
expect(waitForActiveEmbeddedRuns).not.toHaveBeenCalled();
// Embedded runs are aborted instead of drained.
expect(abortEmbeddedPiRun).toHaveBeenCalledWith(undefined, { mode: "all" });
expect(gatewayLog.warn).toHaveBeenCalledWith(
expect.stringContaining("restart blocked by active task run(s): taskId=task-force"),
);
expect(gatewayLog.warn).toHaveBeenCalledWith(
"forced restart requested; skipping active work drain",
);
// NOTE(review): presumably the initial start plus one restart — confirm against the harness.
expect(start).toHaveBeenCalledTimes(2);
// SIGINT ends the loop so the test can observe the exit code.
sigint();
await expect(exited).resolves.toBe(0);
});
});
it("restarts after SIGUSR1 even when drain times out, and resets runtime state for the new iteration", async () => {
vi.clearAllMocks();
loadConfig.mockReturnValue({

View File

@@ -15,6 +15,10 @@ const UPDATE_RESPAWN_HEALTH_POLL_MS = 200;
type GatewayRunSignalAction = "stop" | "restart";
type RestartDrainTimeoutMs = number | undefined;
// Per-restart overrides taken from a consumed restart intent payload.
type RestartIntentOptions = {
// When true, the restart drain timeout resolves to 0 and the active-work drain is skipped
// (embedded runs are aborted instead).
force?: boolean;
// Drain budget in milliseconds. A finite value > 0 is floored and used as the drain timeout;
// a finite value <= 0 resolves to an undefined timeout (the CLI help describes 0 as waiting
// indefinitely). Non-finite values are ignored and the resolver falls through to its
// runtime-config lookup.
waitMs?: number;
};
type GatewayLifecycleRuntimeModule = typeof import("./lifecycle.runtime.js");
@@ -245,7 +249,15 @@ export async function runGatewayLoop(params: {
const SUPERVISOR_STOP_TIMEOUT_MS = 30_000;
const SHUTDOWN_TIMEOUT_MS = SUPERVISOR_STOP_TIMEOUT_MS - 5_000;
const resolveRestartDrainTimeoutMs = async (): Promise<RestartDrainTimeoutMs> => {
const resolveRestartDrainTimeoutMs = async (
restartIntent?: RestartIntentOptions,
): Promise<RestartDrainTimeoutMs> => {
if (restartIntent?.force) {
return 0;
}
if (typeof restartIntent?.waitMs === "number" && Number.isFinite(restartIntent.waitMs)) {
return restartIntent.waitMs > 0 ? Math.floor(restartIntent.waitMs) : undefined;
}
try {
const { getRuntimeConfig, resolveGatewayRestartDeferralTimeoutMs } =
await loadGatewayLifecycleRuntimeModule();
@@ -256,7 +268,12 @@ export async function runGatewayLoop(params: {
}
};
const request = (action: GatewayRunSignalAction, signal: string, restartReason?: string) => {
const request = (
action: GatewayRunSignalAction,
signal: string,
restartReason?: string,
restartIntent?: RestartIntentOptions,
) => {
if (shuttingDown) {
gatewayLog.info(`received ${signal} during shutdown; ignoring`);
return;
@@ -295,7 +312,9 @@ export async function runGatewayLoop(params: {
};
void (async () => {
const restartDrainTimeoutMs = isRestart ? await resolveRestartDrainTimeoutMs() : 0;
const restartDrainTimeoutMs = isRestart
? await resolveRestartDrainTimeoutMs(restartIntent)
: 0;
if (!isRestart) {
armForceExitTimer(SHUTDOWN_TIMEOUT_MS);
} else if (restartDrainTimeoutMs !== undefined) {
@@ -319,12 +338,35 @@ export async function runGatewayLoop(params: {
if (isRestart) {
const {
abortEmbeddedPiRun,
getInspectableActiveTaskRestartBlockers,
getActiveEmbeddedRunCount,
getActiveTaskCount,
markGatewayDraining,
waitForActiveEmbeddedRuns,
waitForActiveTasks,
} = await loadGatewayLifecycleRuntimeModule();
// Renders the active task restart blockers as one log-friendly string.
// Returns null when there are no blockers. At most 8 blockers are listed;
// any remainder is summarized as "+N more".
const formatTaskBlockers = () => {
  const activeBlockers = getInspectableActiveTaskRestartBlockers();
  if (activeBlockers.length === 0) {
    return null;
  }

  // Builds the space-separated `key=value` description for one blocker,
  // leaving out optional fields that are absent or empty.
  const describeBlocker = (blocker: (typeof activeBlockers)[number]) => {
    const fields = [`taskId=${blocker.taskId}`];
    if (blocker.runId) {
      fields.push(`runId=${blocker.runId}`);
    }
    fields.push(`status=${blocker.status}`, `runtime=${blocker.runtime}`);
    if (blocker.label) {
      fields.push(`label=${blocker.label}`);
    }
    if (blocker.title) {
      // Titles are capped at 80 characters to keep the log line bounded.
      fields.push(`title=${blocker.title.slice(0, 80)}`);
    }
    return fields.join(" ");
  };

  const listed = activeBlockers.slice(0, 8).map((blocker) => describeBlocker(blocker));
  const summary = listed.join("; ");
  const hiddenCount = activeBlockers.length - listed.length;
  if (hiddenCount > 0) {
    return `${summary}; +${hiddenCount} more`;
  }
  return summary;
};
const createStillPendingDrainLogger = () =>
setInterval(() => {
gatewayLog.warn(
@@ -345,25 +387,34 @@ export async function runGatewayLoop(params: {
}
if (activeTasks > 0 || activeRuns > 0) {
const taskBlockers = formatTaskBlockers();
gatewayLog.info(
`draining ${activeTasks} active task(s) and ${activeRuns} active embedded run(s) before restart ${formatRestartDrainBudget()}`,
);
const stillPendingDrainLogger = createStillPendingDrainLogger();
const [tasksDrain, runsDrain] = await Promise.all([
activeTasks > 0
? waitForActiveTasks(restartDrainTimeoutMs)
: Promise.resolve({ drained: true }),
activeRuns > 0
? waitForActiveEmbeddedRuns(restartDrainTimeoutMs)
: Promise.resolve({ drained: true }),
]).finally(() => clearInterval(stillPendingDrainLogger));
if (tasksDrain.drained && runsDrain.drained) {
gatewayLog.info("all active work drained");
} else {
gatewayLog.warn("drain timeout reached; proceeding with restart");
// Final best-effort abort to avoid carrying active runs into the
// next lifecycle when drain time budget is exhausted.
if (taskBlockers) {
gatewayLog.warn(`restart blocked by active task run(s): ${taskBlockers}`);
}
if (restartIntent?.force) {
gatewayLog.warn("forced restart requested; skipping active work drain");
abortEmbeddedPiRun(undefined, { mode: "all" });
} else {
const stillPendingDrainLogger = createStillPendingDrainLogger();
const [tasksDrain, runsDrain] = await Promise.all([
activeTasks > 0
? waitForActiveTasks(restartDrainTimeoutMs)
: Promise.resolve({ drained: true }),
activeRuns > 0
? waitForActiveEmbeddedRuns(restartDrainTimeoutMs)
: Promise.resolve({ drained: true }),
]).finally(() => clearInterval(stillPendingDrainLogger));
if (tasksDrain.drained && runsDrain.drained) {
gatewayLog.info("all active work drained");
} else {
gatewayLog.warn("drain timeout reached; proceeding with restart");
// Final best-effort abort to avoid carrying active runs into the
// next lifecycle when drain time budget is exhausted.
abortEmbeddedPiRun(undefined, { mode: "all" });
}
}
}
}
@@ -390,8 +441,9 @@ export async function runGatewayLoop(params: {
const onSigterm = () => {
gatewayLog.info("signal SIGTERM received");
void (async () => {
const { consumeGatewayRestartIntentSync } = await loadGatewayLifecycleRuntimeModule();
request(consumeGatewayRestartIntentSync() ? "restart" : "stop", "SIGTERM");
const { consumeGatewayRestartIntentPayloadSync } = await loadGatewayLifecycleRuntimeModule();
const restartIntent = consumeGatewayRestartIntentPayloadSync();
request(restartIntent ? "restart" : "stop", "SIGTERM", undefined, restartIntent ?? undefined);
})();
};
const onSigint = () => {
@@ -402,12 +454,18 @@ export async function runGatewayLoop(params: {
gatewayLog.info("signal SIGUSR1 received");
void (async () => {
const {
consumeGatewayRestartIntentPayloadSync,
consumeGatewaySigusr1RestartAuthorization,
isGatewaySigusr1RestartExternallyAllowed,
markGatewaySigusr1RestartHandled,
peekGatewaySigusr1RestartReason,
scheduleGatewaySigusr1Restart,
} = await loadGatewayLifecycleRuntimeModule();
const restartIntent = consumeGatewayRestartIntentPayloadSync();
if (restartIntent) {
request("restart", "SIGUSR1", "gateway.restart", restartIntent);
return;
}
const authorized = consumeGatewaySigusr1RestartAuthorization();
if (!authorized) {
if (!isGatewaySigusr1RestartExternallyAllowed()) {