diff --git a/docs/.generated/config-baseline.sha256 b/docs/.generated/config-baseline.sha256 index 0afdb43e9fa..6e36959297e 100644 --- a/docs/.generated/config-baseline.sha256 +++ b/docs/.generated/config-baseline.sha256 @@ -1,4 +1,4 @@ -d4c98bce7b547349b9cbbe08ec1018eafce9900502d7794df993d07fdec0e2e0 config-baseline.json -6ce74b2ab3544e5375009a435a2360a3095e6bd759bb7dd8114293fb8a0e2b25 config-baseline.core.json -0e38bad86bdc96c38573f6d51ac9e6fc5306cc20fb4a454399c57c105a61ba87 config-baseline.channel.json +7f9815a297504c75022c4db2df250ce4cc9ff5c3f69250c67ca253b89148b9f3 config-baseline.json +8bc9fda7c1096472beaa416a61043ce51d691d4dcad9ed3e0be46e68bb70b0ce config-baseline.core.json +45162ff84813be8a1fe561ed8d6245a248d5c6288ef9e9af51bdf4ec05ef65ad config-baseline.channel.json 0dd6583fafae6c9134e46c4cf9bddee9822d6436436dcb1a6dcba6d012962e51 config-baseline.plugin.json diff --git a/docs/gateway/configuration-reference.md b/docs/gateway/configuration-reference.md index a687a308a81..385336a5143 100644 --- a/docs/gateway/configuration-reference.md +++ b/docs/gateway/configuration-reference.md @@ -515,7 +515,7 @@ See [Multiple Gateways](/gateway/multiple-gateways). reload: { mode: "hybrid", // off | restart | hot | hybrid debounceMs: 500, - deferralTimeoutMs: 0, + deferralTimeoutMs: 300000, }, }, } @@ -527,7 +527,7 @@ See [Multiple Gateways](/gateway/multiple-gateways). - `"hot"`: apply changes in-process without restarting. - `"hybrid"` (default): try hot reload first; fall back to restart if required. - `debounceMs`: debounce window in ms before config changes are applied (non-negative integer). -- `deferralTimeoutMs`: optional maximum time in ms to wait for in-flight operations before forcing a restart. Omit it or set `0` to wait indefinitely and log periodic still-pending warnings. +- `deferralTimeoutMs`: optional maximum time in ms to wait for in-flight operations before forcing a restart. Omit it to use the default bounded wait (`300000`); set `0` to wait indefinitely and log periodic still-pending warnings. --- diff --git a/src/cli/gateway-cli/run-loop.test.ts b/src/cli/gateway-cli/run-loop.test.ts index 02e3198d0be..dc86047cef6 100644 --- a/src/cli/gateway-cli/run-loop.test.ts +++ b/src/cli/gateway-cli/run-loop.test.ts @@ -50,7 +50,8 @@ const abortEmbeddedPiRun = vi.fn( const getActiveEmbeddedRunCount = vi.fn(() => 0); const waitForActiveEmbeddedRuns = vi.fn(async (_timeoutMs?: number) => ({ drained: true })); const DRAIN_TIMEOUT_LOG = "drain timeout reached; proceeding with restart"; -const loadConfig = vi.fn(() => ({ +const DEFAULT_RESTART_DEFERRAL_TIMEOUT_MS = 300_000; +const loadConfig = vi.fn<() => { gateway: { reload: { deferralTimeoutMs?: number } } }>(() => ({ gateway: { reload: { deferralTimeoutMs: 90_000, @@ -74,6 +75,15 @@ vi.mock("../../infra/restart.js", () => ({ markGatewaySigusr1RestartHandled: () => markGatewaySigusr1RestartHandled(), peekGatewaySigusr1RestartReason: () => peekGatewaySigusr1RestartReason(), resetGatewayRestartStateForInProcessRestart: () => resetGatewayRestartStateForInProcessRestart(), + resolveGatewayRestartDeferralTimeoutMs: (timeoutMs: unknown) => { + if (typeof timeoutMs !== "number" || !Number.isFinite(timeoutMs)) { + return DEFAULT_RESTART_DEFERRAL_TIMEOUT_MS; + } + if (timeoutMs <= 0) { + return undefined; + } + return Math.floor(timeoutMs); + }, scheduleGatewaySigusr1Restart: (opts?: { delayMs?: number; reason?: string }) => scheduleGatewaySigusr1Restart(opts), })); @@ -438,6 +448,31 @@ describe("runGatewayLoop", () => { }); }); + it("uses the default restart drain timeout when config omits deferralTimeoutMs", async () => { + vi.clearAllMocks(); + loadConfig.mockReturnValue({ gateway: { reload: {} } }); + peekGatewaySigusr1RestartReason.mockReturnValue(undefined); + respawnGatewayProcessForUpdate.mockReturnValue({ + mode: "disabled", + detail: "OPENCLAW_NO_RESPAWN", + }); + + await withIsolatedSignals(async ({ captureSignal }) => { + getActiveTaskCount.mockReturnValueOnce(1).mockReturnValue(0); + + const { start } = await createSignaledLoopHarness(); + const sigusr1 = captureSignal("SIGUSR1"); + + sigusr1(); + await new Promise((resolve) => setImmediate(resolve)); + await new Promise((resolve) => setImmediate(resolve)); + + expect(waitForActiveTasks).toHaveBeenCalledWith(DEFAULT_RESTART_DEFERRAL_TIMEOUT_MS); + expect(markGatewayDraining).toHaveBeenCalledOnce(); + expect(start).toHaveBeenCalledTimes(2); + }); + }); + it("routes external SIGUSR1 through the restart scheduler before draining", async () => { vi.clearAllMocks(); consumeGatewaySigusr1RestartAuthorization.mockReturnValueOnce(false); diff --git a/src/cli/gateway-cli/run-loop.ts b/src/cli/gateway-cli/run-loop.ts index 1e614838e83..3eb7288cd9f 100644 --- a/src/cli/gateway-cli/run-loop.ts +++ b/src/cli/gateway-cli/run-loop.ts @@ -279,11 +279,12 @@ export async function runGatewayLoop(params: { const SHUTDOWN_TIMEOUT_MS = SUPERVISOR_STOP_TIMEOUT_MS - 5_000; const resolveRestartDrainTimeoutMs = async (): Promise => { try { - const { getRuntimeConfig } = await loadRuntimeConfigModule(); + const [{ getRuntimeConfig }, { resolveGatewayRestartDeferralTimeoutMs }] = await Promise.all([ + loadRuntimeConfigModule(), + loadRestartModule(), + ]); const timeoutMs = getRuntimeConfig().gateway?.reload?.deferralTimeoutMs; - return typeof timeoutMs === "number" && Number.isFinite(timeoutMs) && timeoutMs > 0 - ? timeoutMs - : undefined; + return resolveGatewayRestartDeferralTimeoutMs(timeoutMs); } catch { return DEFAULT_RESTART_DRAIN_TIMEOUT_MS; } diff --git a/src/config/schema.base.generated.ts b/src/config/schema.base.generated.ts index 2459f2ebfe0..ef7f1b6982d 100644 --- a/src/config/schema.base.generated.ts +++ b/src/config/schema.base.generated.ts @@ -22632,7 +22632,7 @@ export const GENERATED_BASE_CONFIG_SCHEMA: BaseConfigSchemaResponse = { maximum: 9007199254740991, title: "Restart Deferral Timeout (ms)", description: - "Optional maximum time (ms) to wait for in-flight operations before forcing a restart. Omit or set 0 to wait indefinitely with periodic still-pending warnings. Lower positive values risk aborting active subagent LLM calls.", + "Optional maximum time (ms) to wait for in-flight operations before forcing a restart. Omit to use the default bounded wait; set 0 to wait indefinitely with periodic still-pending warnings. Lower positive values risk aborting active subagent LLM calls.", }, }, additionalProperties: false, @@ -25803,7 +25803,7 @@ export const GENERATED_BASE_CONFIG_SCHEMA: BaseConfigSchemaResponse = { }, "gateway.reload.deferralTimeoutMs": { label: "Restart Deferral Timeout (ms)", - help: "Optional maximum time (ms) to wait for in-flight operations before forcing a restart. Omit or set 0 to wait indefinitely with periodic still-pending warnings. Lower positive values risk aborting active subagent LLM calls.", + help: "Optional maximum time (ms) to wait for in-flight operations before forcing a restart. Omit to use the default bounded wait; set 0 to wait indefinitely with periodic still-pending warnings. Lower positive values risk aborting active subagent LLM calls.", tags: ["network", "reliability", "performance"], }, "gateway.nodes.browser.mode": { diff --git a/src/config/schema.help.ts b/src/config/schema.help.ts index 09bc54cac22..8024365a134 100644 --- a/src/config/schema.help.ts +++ b/src/config/schema.help.ts @@ -492,7 +492,7 @@ export const FIELD_HELP: Record = { 'Controls how config edits are applied: "off" ignores live edits, "restart" always restarts, "hot" applies in-process, and "hybrid" tries hot then restarts if required. Keep "hybrid" for safest routine updates.', "gateway.reload.debounceMs": "Debounce window (ms) before applying config changes.", "gateway.reload.deferralTimeoutMs": - "Optional maximum time (ms) to wait for in-flight operations before forcing a restart. Omit or set 0 to wait indefinitely with periodic still-pending warnings. Lower positive values risk aborting active subagent LLM calls.", + "Optional maximum time (ms) to wait for in-flight operations before forcing a restart. Omit to use the default bounded wait; set 0 to wait indefinitely with periodic still-pending warnings. Lower positive values risk aborting active subagent LLM calls.", "gateway.nodes.browser.mode": 'Node browser routing ("auto" = pick single connected browser node, "manual" = require node param, "off" = disable).', "gateway.nodes.browser.node": "Pin browser routing to a specific node id or name (optional).", diff --git a/src/config/types.gateway.ts b/src/config/types.gateway.ts index 5535d0e2e6d..72c5242f60f 100644 --- a/src/config/types.gateway.ts +++ b/src/config/types.gateway.ts @@ -215,8 +215,8 @@ export type GatewayReloadConfig = { debounceMs?: number; /** * Optional maximum time (ms) to wait for in-flight operations to complete - * before forcing a restart. Absent or 0 waits indefinitely and logs periodic - * still-pending warnings. + * before forcing a restart. Absent uses the gateway's default bounded wait; + * 0 waits indefinitely and logs periodic still-pending warnings. * Lower positive values risk aborting active subagent LLM calls. * @see https://github.com/openclaw/openclaw/issues/65485 */ diff --git a/src/gateway/server-reload-handlers.ts b/src/gateway/server-reload-handlers.ts index c0faf00cffa..2864fa76b9a 100644 --- a/src/gateway/server-reload-handlers.ts +++ b/src/gateway/server-reload-handlers.ts @@ -12,6 +12,7 @@ import { resetDirectoryCache } from "../infra/outbound/target-resolver.js"; import { deferGatewayRestartUntilIdle, emitGatewayRestart, + resolveGatewayRestartDeferralTimeoutMs, setGatewaySigusr1RestartPolicy, } from "../infra/restart.js"; import { getTotalQueueSize } from "../process/command-queue.js"; @@ -363,7 +364,9 @@ export function createGatewayReloadHandlers(params: GatewayReloadHandlerParams) deferGatewayRestartUntilIdle({ getPendingCount: () => getActiveCounts().totalActive, - maxWaitMs: nextConfig.gateway?.reload?.deferralTimeoutMs, + maxWaitMs: resolveGatewayRestartDeferralTimeoutMs( + nextConfig.gateway?.reload?.deferralTimeoutMs, + ), hooks: { onReady: () => { restartPending = false; diff --git a/src/gateway/server.reload.test.ts b/src/gateway/server.reload.test.ts index 05f68017290..217a81d50f0 100644 --- a/src/gateway/server.reload.test.ts +++ b/src/gateway/server.reload.test.ts @@ -727,6 +727,50 @@ describe("gateway hot reload", () => { }); }); + it("uses the default restart deferral timeout when config omits deferralTimeoutMs", async () => { + await withNonMinimalGatewayServer(async () => { + const onRestart = hoisted.getOnRestart(); + expect(onRestart).toBeTypeOf("function"); + + const restartTesting = (await import("../infra/restart.js")).__testing; + restartTesting.resetSigusr1State(); + hoisted.activeTaskCount.value = 1; + const signalSpy = vi.fn(); + process.once("SIGUSR1", signalSpy); + vi.useFakeTimers(); + + try { + onRestart?.( + { + changedPaths: ["gateway.port"], + restartGateway: true, + restartReasons: ["gateway.port"], + hotReasons: [], + reloadHooks: false, + restartGmailWatcher: false, + restartCron: false, + restartHeartbeat: false, + restartChannels: new Set(), + noopPaths: [], + }, + {}, + ); + + await vi.advanceTimersByTimeAsync(299_500); + expect(signalSpy).not.toHaveBeenCalled(); + + await vi.advanceTimersByTimeAsync(500); + await Promise.resolve(); + expect(signalSpy).toHaveBeenCalledTimes(1); + } finally { + hoisted.activeTaskCount.value = 0; + vi.useRealTimers(); + process.removeListener("SIGUSR1", signalSpy); + restartTesting.resetSigusr1State(); + } + }); + }); + it("emits one-shot degraded and recovered system events during secret reload transitions", async () => { await writeEnvRefConfig(); process.env.OPENAI_API_KEY = "sk-startup"; // pragma: allowlist secret diff --git a/src/infra/infra-runtime.test.ts b/src/infra/infra-runtime.test.ts index 90188508482..3b508192b2a 100644 --- a/src/infra/infra-runtime.test.ts +++ b/src/infra/infra-runtime.test.ts @@ -483,7 +483,7 @@ describe("infra runtime", () => { } }); - it("keeps SIGUSR1 deferred by default while work is still pending", async () => { + it("emits SIGUSR1 after the default deferral timeout while work is still pending", async () => { const emitSpy = vi.spyOn(process, "emit"); const handler = () => {}; process.on("SIGUSR1", handler); @@ -495,8 +495,25 @@ describe("infra runtime", () => { await vi.advanceTimersByTimeAsync(0); expect(emitSpy).not.toHaveBeenCalledWith("SIGUSR1"); - // No default max deferral wait; active turns should not be killed just - // because a config-triggered restart has been pending for 5 minutes. + await vi.advanceTimersByTimeAsync(300_000); + expect(emitSpy).toHaveBeenCalledWith("SIGUSR1"); + } finally { + process.removeListener("SIGUSR1", handler); + } + }); + + it("keeps SIGUSR1 deferred when deferral timeout is explicitly disabled", async () => { + const emitSpy = vi.spyOn(process, "emit"); + const handler = () => {}; + process.on("SIGUSR1", handler); + try { + setRuntimeConfigSnapshot({ gateway: { reload: { deferralTimeoutMs: 0 } } }); + setPreRestartDeferralCheck(() => 5); // always pending + scheduleGatewaySigusr1Restart({ delayMs: 0 }); + + await vi.advanceTimersByTimeAsync(0); + expect(emitSpy).not.toHaveBeenCalledWith("SIGUSR1"); + await vi.advanceTimersByTimeAsync(300_000); expect(emitSpy).not.toHaveBeenCalledWith("SIGUSR1"); } finally { diff --git a/src/infra/restart.ts b/src/infra/restart.ts index 5b32de994a9..56b6a906f66 100644 --- a/src/infra/restart.ts +++ b/src/infra/restart.ts @@ -20,6 +20,7 @@ const SPAWN_TIMEOUT_MS = 2000; const SIGUSR1_AUTH_GRACE_MS = 5000; const DEFAULT_DEFERRAL_POLL_MS = 500; const DEFAULT_DEFERRAL_STILL_PENDING_WARN_MS = 30_000; +export const DEFAULT_RESTART_DEFERRAL_TIMEOUT_MS = 300_000; const RESTART_COOLDOWN_MS = 30_000; const LAUNCHCTL_ALREADY_LOADED_EXIT_CODE = 37; const GATEWAY_RESTART_INTENT_FILENAME = "gateway-restart-intent.json"; @@ -368,6 +369,16 @@ export type RestartEmitHooks = { afterEmitRejected?: () => Promise; }; +export function resolveGatewayRestartDeferralTimeoutMs(timeoutMs: unknown): number | undefined { + if (typeof timeoutMs !== "number" || !Number.isFinite(timeoutMs)) { + return DEFAULT_RESTART_DEFERRAL_TIMEOUT_MS; + } + if (timeoutMs <= 0) { + return undefined; + } + return Math.floor(timeoutMs); +} + function updatePendingRestartEmitHooks(hooks?: RestartEmitHooks): void { if (hooks) { pendingRestartEmitHooks = hooks; @@ -729,7 +740,7 @@ export function scheduleGatewaySigusr1Restart(opts?: { const cfg = getRuntimeConfig(); deferGatewayRestartUntilIdle({ getPendingCount: pendingCheck, - maxWaitMs: cfg.gateway?.reload?.deferralTimeoutMs, + maxWaitMs: resolveGatewayRestartDeferralTimeoutMs(cfg.gateway?.reload?.deferralTimeoutMs), reason: scheduledReason, }); },