fix(gateway): bound default restart deferral

This commit is contained in:
Vincent Koc
2026-04-28 18:42:21 -07:00
parent 7e5c3753f6
commit 1f41b8b44b
11 changed files with 131 additions and 20 deletions

View File

@@ -1,4 +1,4 @@
d4c98bce7b547349b9cbbe08ec1018eafce9900502d7794df993d07fdec0e2e0 config-baseline.json 7f9815a297504c75022c4db2df250ce4cc9ff5c3f69250c67ca253b89148b9f3 config-baseline.json
6ce74b2ab3544e5375009a435a2360a3095e6bd759bb7dd8114293fb8a0e2b25 config-baseline.core.json 8bc9fda7c1096472beaa416a61043ce51d691d4dcad9ed3e0be46e68bb70b0ce config-baseline.core.json
0e38bad86bdc96c38573f6d51ac9e6fc5306cc20fb4a454399c57c105a61ba87 config-baseline.channel.json 45162ff84813be8a1fe561ed8d6245a248d5c6288ef9e9af51bdf4ec05ef65ad config-baseline.channel.json
0dd6583fafae6c9134e46c4cf9bddee9822d6436436dcb1a6dcba6d012962e51 config-baseline.plugin.json 0dd6583fafae6c9134e46c4cf9bddee9822d6436436dcb1a6dcba6d012962e51 config-baseline.plugin.json

View File

@@ -515,7 +515,7 @@ See [Multiple Gateways](/gateway/multiple-gateways).
reload: { reload: {
mode: "hybrid", // off | restart | hot | hybrid mode: "hybrid", // off | restart | hot | hybrid
debounceMs: 500, debounceMs: 500,
deferralTimeoutMs: 0, deferralTimeoutMs: 300000,
}, },
}, },
} }
@@ -527,7 +527,7 @@ See [Multiple Gateways](/gateway/multiple-gateways).
- `"hot"`: apply changes in-process without restarting. - `"hot"`: apply changes in-process without restarting.
- `"hybrid"` (default): try hot reload first; fall back to restart if required. - `"hybrid"` (default): try hot reload first; fall back to restart if required.
- `debounceMs`: debounce window in ms before config changes are applied (non-negative integer). - `debounceMs`: debounce window in ms before config changes are applied (non-negative integer).
- `deferralTimeoutMs`: optional maximum time in ms to wait for in-flight operations before forcing a restart. Omit it or set `0` to wait indefinitely and log periodic still-pending warnings. - `deferralTimeoutMs`: optional maximum time in ms to wait for in-flight operations before forcing a restart. Omit it to use the default bounded wait (`300000`); set `0` to wait indefinitely and log periodic still-pending warnings.
--- ---

View File

@@ -50,7 +50,8 @@ const abortEmbeddedPiRun = vi.fn(
const getActiveEmbeddedRunCount = vi.fn(() => 0); const getActiveEmbeddedRunCount = vi.fn(() => 0);
const waitForActiveEmbeddedRuns = vi.fn(async (_timeoutMs?: number) => ({ drained: true })); const waitForActiveEmbeddedRuns = vi.fn(async (_timeoutMs?: number) => ({ drained: true }));
const DRAIN_TIMEOUT_LOG = "drain timeout reached; proceeding with restart"; const DRAIN_TIMEOUT_LOG = "drain timeout reached; proceeding with restart";
const loadConfig = vi.fn(() => ({ const DEFAULT_RESTART_DEFERRAL_TIMEOUT_MS = 300_000;
const loadConfig = vi.fn<() => { gateway: { reload: { deferralTimeoutMs?: number } } }>(() => ({
gateway: { gateway: {
reload: { reload: {
deferralTimeoutMs: 90_000, deferralTimeoutMs: 90_000,
@@ -74,6 +75,15 @@ vi.mock("../../infra/restart.js", () => ({
markGatewaySigusr1RestartHandled: () => markGatewaySigusr1RestartHandled(), markGatewaySigusr1RestartHandled: () => markGatewaySigusr1RestartHandled(),
peekGatewaySigusr1RestartReason: () => peekGatewaySigusr1RestartReason(), peekGatewaySigusr1RestartReason: () => peekGatewaySigusr1RestartReason(),
resetGatewayRestartStateForInProcessRestart: () => resetGatewayRestartStateForInProcessRestart(), resetGatewayRestartStateForInProcessRestart: () => resetGatewayRestartStateForInProcessRestart(),
resolveGatewayRestartDeferralTimeoutMs: (timeoutMs: unknown) => {
if (typeof timeoutMs !== "number" || !Number.isFinite(timeoutMs)) {
return DEFAULT_RESTART_DEFERRAL_TIMEOUT_MS;
}
if (timeoutMs <= 0) {
return undefined;
}
return Math.floor(timeoutMs);
},
scheduleGatewaySigusr1Restart: (opts?: { delayMs?: number; reason?: string }) => scheduleGatewaySigusr1Restart: (opts?: { delayMs?: number; reason?: string }) =>
scheduleGatewaySigusr1Restart(opts), scheduleGatewaySigusr1Restart(opts),
})); }));
@@ -438,6 +448,31 @@ describe("runGatewayLoop", () => {
}); });
}); });
it("uses the default restart drain timeout when config omits deferralTimeoutMs", async () => {
vi.clearAllMocks();
loadConfig.mockReturnValue({ gateway: { reload: {} } });
peekGatewaySigusr1RestartReason.mockReturnValue(undefined);
respawnGatewayProcessForUpdate.mockReturnValue({
mode: "disabled",
detail: "OPENCLAW_NO_RESPAWN",
});
await withIsolatedSignals(async ({ captureSignal }) => {
getActiveTaskCount.mockReturnValueOnce(1).mockReturnValue(0);
const { start } = await createSignaledLoopHarness();
const sigusr1 = captureSignal("SIGUSR1");
sigusr1();
await new Promise<void>((resolve) => setImmediate(resolve));
await new Promise<void>((resolve) => setImmediate(resolve));
expect(waitForActiveTasks).toHaveBeenCalledWith(DEFAULT_RESTART_DEFERRAL_TIMEOUT_MS);
expect(markGatewayDraining).toHaveBeenCalledOnce();
expect(start).toHaveBeenCalledTimes(2);
});
});
it("routes external SIGUSR1 through the restart scheduler before draining", async () => { it("routes external SIGUSR1 through the restart scheduler before draining", async () => {
vi.clearAllMocks(); vi.clearAllMocks();
consumeGatewaySigusr1RestartAuthorization.mockReturnValueOnce(false); consumeGatewaySigusr1RestartAuthorization.mockReturnValueOnce(false);

View File

@@ -279,11 +279,12 @@ export async function runGatewayLoop(params: {
const SHUTDOWN_TIMEOUT_MS = SUPERVISOR_STOP_TIMEOUT_MS - 5_000; const SHUTDOWN_TIMEOUT_MS = SUPERVISOR_STOP_TIMEOUT_MS - 5_000;
const resolveRestartDrainTimeoutMs = async (): Promise<RestartDrainTimeoutMs> => { const resolveRestartDrainTimeoutMs = async (): Promise<RestartDrainTimeoutMs> => {
try { try {
const { getRuntimeConfig } = await loadRuntimeConfigModule(); const [{ getRuntimeConfig }, { resolveGatewayRestartDeferralTimeoutMs }] = await Promise.all([
loadRuntimeConfigModule(),
loadRestartModule(),
]);
const timeoutMs = getRuntimeConfig().gateway?.reload?.deferralTimeoutMs; const timeoutMs = getRuntimeConfig().gateway?.reload?.deferralTimeoutMs;
return typeof timeoutMs === "number" && Number.isFinite(timeoutMs) && timeoutMs > 0 return resolveGatewayRestartDeferralTimeoutMs(timeoutMs);
? timeoutMs
: undefined;
} catch { } catch {
return DEFAULT_RESTART_DRAIN_TIMEOUT_MS; return DEFAULT_RESTART_DRAIN_TIMEOUT_MS;
} }

View File

@@ -22632,7 +22632,7 @@ export const GENERATED_BASE_CONFIG_SCHEMA: BaseConfigSchemaResponse = {
maximum: 9007199254740991, maximum: 9007199254740991,
title: "Restart Deferral Timeout (ms)", title: "Restart Deferral Timeout (ms)",
description: description:
"Optional maximum time (ms) to wait for in-flight operations before forcing a restart. Omit or set 0 to wait indefinitely with periodic still-pending warnings. Lower positive values risk aborting active subagent LLM calls.", "Optional maximum time (ms) to wait for in-flight operations before forcing a restart. Omit to use the default bounded wait; set 0 to wait indefinitely with periodic still-pending warnings. Lower positive values risk aborting active subagent LLM calls.",
}, },
}, },
additionalProperties: false, additionalProperties: false,
@@ -25803,7 +25803,7 @@ export const GENERATED_BASE_CONFIG_SCHEMA: BaseConfigSchemaResponse = {
}, },
"gateway.reload.deferralTimeoutMs": { "gateway.reload.deferralTimeoutMs": {
label: "Restart Deferral Timeout (ms)", label: "Restart Deferral Timeout (ms)",
help: "Optional maximum time (ms) to wait for in-flight operations before forcing a restart. Omit or set 0 to wait indefinitely with periodic still-pending warnings. Lower positive values risk aborting active subagent LLM calls.", help: "Optional maximum time (ms) to wait for in-flight operations before forcing a restart. Omit to use the default bounded wait; set 0 to wait indefinitely with periodic still-pending warnings. Lower positive values risk aborting active subagent LLM calls.",
tags: ["network", "reliability", "performance"], tags: ["network", "reliability", "performance"],
}, },
"gateway.nodes.browser.mode": { "gateway.nodes.browser.mode": {

View File

@@ -492,7 +492,7 @@ export const FIELD_HELP: Record<string, string> = {
'Controls how config edits are applied: "off" ignores live edits, "restart" always restarts, "hot" applies in-process, and "hybrid" tries hot then restarts if required. Keep "hybrid" for safest routine updates.', 'Controls how config edits are applied: "off" ignores live edits, "restart" always restarts, "hot" applies in-process, and "hybrid" tries hot then restarts if required. Keep "hybrid" for safest routine updates.',
"gateway.reload.debounceMs": "Debounce window (ms) before applying config changes.", "gateway.reload.debounceMs": "Debounce window (ms) before applying config changes.",
"gateway.reload.deferralTimeoutMs": "gateway.reload.deferralTimeoutMs":
"Optional maximum time (ms) to wait for in-flight operations before forcing a restart. Omit or set 0 to wait indefinitely with periodic still-pending warnings. Lower positive values risk aborting active subagent LLM calls.", "Optional maximum time (ms) to wait for in-flight operations before forcing a restart. Omit to use the default bounded wait; set 0 to wait indefinitely with periodic still-pending warnings. Lower positive values risk aborting active subagent LLM calls.",
"gateway.nodes.browser.mode": "gateway.nodes.browser.mode":
'Node browser routing ("auto" = pick single connected browser node, "manual" = require node param, "off" = disable).', 'Node browser routing ("auto" = pick single connected browser node, "manual" = require node param, "off" = disable).',
"gateway.nodes.browser.node": "Pin browser routing to a specific node id or name (optional).", "gateway.nodes.browser.node": "Pin browser routing to a specific node id or name (optional).",

View File

@@ -215,8 +215,8 @@ export type GatewayReloadConfig = {
debounceMs?: number; debounceMs?: number;
/** /**
* Optional maximum time (ms) to wait for in-flight operations to complete * Optional maximum time (ms) to wait for in-flight operations to complete
* before forcing a restart. Absent or 0 waits indefinitely and logs periodic * before forcing a restart. Absent uses the gateway's default bounded wait;
* still-pending warnings. * 0 waits indefinitely and logs periodic still-pending warnings.
* Lower positive values risk aborting active subagent LLM calls. * Lower positive values risk aborting active subagent LLM calls.
* @see https://github.com/openclaw/openclaw/issues/65485 * @see https://github.com/openclaw/openclaw/issues/65485
*/ */

View File

@@ -12,6 +12,7 @@ import { resetDirectoryCache } from "../infra/outbound/target-resolver.js";
import { import {
deferGatewayRestartUntilIdle, deferGatewayRestartUntilIdle,
emitGatewayRestart, emitGatewayRestart,
resolveGatewayRestartDeferralTimeoutMs,
setGatewaySigusr1RestartPolicy, setGatewaySigusr1RestartPolicy,
} from "../infra/restart.js"; } from "../infra/restart.js";
import { getTotalQueueSize } from "../process/command-queue.js"; import { getTotalQueueSize } from "../process/command-queue.js";
@@ -363,7 +364,9 @@ export function createGatewayReloadHandlers(params: GatewayReloadHandlerParams)
deferGatewayRestartUntilIdle({ deferGatewayRestartUntilIdle({
getPendingCount: () => getActiveCounts().totalActive, getPendingCount: () => getActiveCounts().totalActive,
maxWaitMs: nextConfig.gateway?.reload?.deferralTimeoutMs, maxWaitMs: resolveGatewayRestartDeferralTimeoutMs(
nextConfig.gateway?.reload?.deferralTimeoutMs,
),
hooks: { hooks: {
onReady: () => { onReady: () => {
restartPending = false; restartPending = false;

View File

@@ -727,6 +727,50 @@ describe("gateway hot reload", () => {
}); });
}); });
it("uses the default restart deferral timeout when config omits deferralTimeoutMs", async () => {
await withNonMinimalGatewayServer(async () => {
const onRestart = hoisted.getOnRestart();
expect(onRestart).toBeTypeOf("function");
const restartTesting = (await import("../infra/restart.js")).__testing;
restartTesting.resetSigusr1State();
hoisted.activeTaskCount.value = 1;
const signalSpy = vi.fn();
process.once("SIGUSR1", signalSpy);
vi.useFakeTimers();
try {
onRestart?.(
{
changedPaths: ["gateway.port"],
restartGateway: true,
restartReasons: ["gateway.port"],
hotReasons: [],
reloadHooks: false,
restartGmailWatcher: false,
restartCron: false,
restartHeartbeat: false,
restartChannels: new Set(),
noopPaths: [],
},
{},
);
await vi.advanceTimersByTimeAsync(299_500);
expect(signalSpy).not.toHaveBeenCalled();
await vi.advanceTimersByTimeAsync(500);
await Promise.resolve();
expect(signalSpy).toHaveBeenCalledTimes(1);
} finally {
hoisted.activeTaskCount.value = 0;
vi.useRealTimers();
process.removeListener("SIGUSR1", signalSpy);
restartTesting.resetSigusr1State();
}
});
});
it("emits one-shot degraded and recovered system events during secret reload transitions", async () => { it("emits one-shot degraded and recovered system events during secret reload transitions", async () => {
await writeEnvRefConfig(); await writeEnvRefConfig();
process.env.OPENAI_API_KEY = "sk-startup"; // pragma: allowlist secret process.env.OPENAI_API_KEY = "sk-startup"; // pragma: allowlist secret

View File

@@ -483,7 +483,7 @@ describe("infra runtime", () => {
} }
}); });
it("keeps SIGUSR1 deferred by default while work is still pending", async () => { it("emits SIGUSR1 after the default deferral timeout while work is still pending", async () => {
const emitSpy = vi.spyOn(process, "emit"); const emitSpy = vi.spyOn(process, "emit");
const handler = () => {}; const handler = () => {};
process.on("SIGUSR1", handler); process.on("SIGUSR1", handler);
@@ -495,8 +495,25 @@ describe("infra runtime", () => {
await vi.advanceTimersByTimeAsync(0); await vi.advanceTimersByTimeAsync(0);
expect(emitSpy).not.toHaveBeenCalledWith("SIGUSR1"); expect(emitSpy).not.toHaveBeenCalledWith("SIGUSR1");
// No default max deferral wait; active turns should not be killed just await vi.advanceTimersByTimeAsync(300_000);
// because a config-triggered restart has been pending for 5 minutes. expect(emitSpy).toHaveBeenCalledWith("SIGUSR1");
} finally {
process.removeListener("SIGUSR1", handler);
}
});
it("keeps SIGUSR1 deferred when deferral timeout is explicitly disabled", async () => {
const emitSpy = vi.spyOn(process, "emit");
const handler = () => {};
process.on("SIGUSR1", handler);
try {
setRuntimeConfigSnapshot({ gateway: { reload: { deferralTimeoutMs: 0 } } });
setPreRestartDeferralCheck(() => 5); // always pending
scheduleGatewaySigusr1Restart({ delayMs: 0 });
await vi.advanceTimersByTimeAsync(0);
expect(emitSpy).not.toHaveBeenCalledWith("SIGUSR1");
await vi.advanceTimersByTimeAsync(300_000); await vi.advanceTimersByTimeAsync(300_000);
expect(emitSpy).not.toHaveBeenCalledWith("SIGUSR1"); expect(emitSpy).not.toHaveBeenCalledWith("SIGUSR1");
} finally { } finally {

View File

@@ -20,6 +20,7 @@ const SPAWN_TIMEOUT_MS = 2000;
const SIGUSR1_AUTH_GRACE_MS = 5000; const SIGUSR1_AUTH_GRACE_MS = 5000;
const DEFAULT_DEFERRAL_POLL_MS = 500; const DEFAULT_DEFERRAL_POLL_MS = 500;
const DEFAULT_DEFERRAL_STILL_PENDING_WARN_MS = 30_000; const DEFAULT_DEFERRAL_STILL_PENDING_WARN_MS = 30_000;
export const DEFAULT_RESTART_DEFERRAL_TIMEOUT_MS = 300_000;
const RESTART_COOLDOWN_MS = 30_000; const RESTART_COOLDOWN_MS = 30_000;
const LAUNCHCTL_ALREADY_LOADED_EXIT_CODE = 37; const LAUNCHCTL_ALREADY_LOADED_EXIT_CODE = 37;
const GATEWAY_RESTART_INTENT_FILENAME = "gateway-restart-intent.json"; const GATEWAY_RESTART_INTENT_FILENAME = "gateway-restart-intent.json";
@@ -368,6 +369,16 @@ export type RestartEmitHooks = {
afterEmitRejected?: () => Promise<void>; afterEmitRejected?: () => Promise<void>;
}; };
export function resolveGatewayRestartDeferralTimeoutMs(timeoutMs: unknown): number | undefined {
if (typeof timeoutMs !== "number" || !Number.isFinite(timeoutMs)) {
return DEFAULT_RESTART_DEFERRAL_TIMEOUT_MS;
}
if (timeoutMs <= 0) {
return undefined;
}
return Math.floor(timeoutMs);
}
function updatePendingRestartEmitHooks(hooks?: RestartEmitHooks): void { function updatePendingRestartEmitHooks(hooks?: RestartEmitHooks): void {
if (hooks) { if (hooks) {
pendingRestartEmitHooks = hooks; pendingRestartEmitHooks = hooks;
@@ -729,7 +740,7 @@ export function scheduleGatewaySigusr1Restart(opts?: {
const cfg = getRuntimeConfig(); const cfg = getRuntimeConfig();
deferGatewayRestartUntilIdle({ deferGatewayRestartUntilIdle({
getPendingCount: pendingCheck, getPendingCount: pendingCheck,
maxWaitMs: cfg.gateway?.reload?.deferralTimeoutMs, maxWaitMs: resolveGatewayRestartDeferralTimeoutMs(cfg.gateway?.reload?.deferralTimeoutMs),
reason: scheduledReason, reason: scheduledReason,
}); });
}, },