mirror of
https://github.com/openclaw/openclaw.git
synced 2026-05-06 05:30:42 +00:00
fix(gateway): bound default restart deferral
This commit is contained in:
@@ -1,4 +1,4 @@
|
||||
d4c98bce7b547349b9cbbe08ec1018eafce9900502d7794df993d07fdec0e2e0 config-baseline.json
|
||||
6ce74b2ab3544e5375009a435a2360a3095e6bd759bb7dd8114293fb8a0e2b25 config-baseline.core.json
|
||||
0e38bad86bdc96c38573f6d51ac9e6fc5306cc20fb4a454399c57c105a61ba87 config-baseline.channel.json
|
||||
7f9815a297504c75022c4db2df250ce4cc9ff5c3f69250c67ca253b89148b9f3 config-baseline.json
|
||||
8bc9fda7c1096472beaa416a61043ce51d691d4dcad9ed3e0be46e68bb70b0ce config-baseline.core.json
|
||||
45162ff84813be8a1fe561ed8d6245a248d5c6288ef9e9af51bdf4ec05ef65ad config-baseline.channel.json
|
||||
0dd6583fafae6c9134e46c4cf9bddee9822d6436436dcb1a6dcba6d012962e51 config-baseline.plugin.json
|
||||
|
||||
@@ -515,7 +515,7 @@ See [Multiple Gateways](/gateway/multiple-gateways).
|
||||
reload: {
|
||||
mode: "hybrid", // off | restart | hot | hybrid
|
||||
debounceMs: 500,
|
||||
deferralTimeoutMs: 0,
|
||||
deferralTimeoutMs: 300000,
|
||||
},
|
||||
},
|
||||
}
|
||||
@@ -527,7 +527,7 @@ See [Multiple Gateways](/gateway/multiple-gateways).
|
||||
- `"hot"`: apply changes in-process without restarting.
|
||||
- `"hybrid"` (default): try hot reload first; fall back to restart if required.
|
||||
- `debounceMs`: debounce window in ms before config changes are applied (non-negative integer).
|
||||
- `deferralTimeoutMs`: optional maximum time in ms to wait for in-flight operations before forcing a restart. Omit it or set `0` to wait indefinitely and log periodic still-pending warnings.
|
||||
- `deferralTimeoutMs`: optional maximum time in ms to wait for in-flight operations before forcing a restart. Omit it to use the default bounded wait (`300000`); set `0` to wait indefinitely and log periodic still-pending warnings.
|
||||
|
||||
---
|
||||
|
||||
|
||||
@@ -50,7 +50,8 @@ const abortEmbeddedPiRun = vi.fn(
|
||||
const getActiveEmbeddedRunCount = vi.fn(() => 0);
|
||||
const waitForActiveEmbeddedRuns = vi.fn(async (_timeoutMs?: number) => ({ drained: true }));
|
||||
const DRAIN_TIMEOUT_LOG = "drain timeout reached; proceeding with restart";
|
||||
const loadConfig = vi.fn(() => ({
|
||||
const DEFAULT_RESTART_DEFERRAL_TIMEOUT_MS = 300_000;
|
||||
const loadConfig = vi.fn<() => { gateway: { reload: { deferralTimeoutMs?: number } } }>(() => ({
|
||||
gateway: {
|
||||
reload: {
|
||||
deferralTimeoutMs: 90_000,
|
||||
@@ -74,6 +75,15 @@ vi.mock("../../infra/restart.js", () => ({
|
||||
markGatewaySigusr1RestartHandled: () => markGatewaySigusr1RestartHandled(),
|
||||
peekGatewaySigusr1RestartReason: () => peekGatewaySigusr1RestartReason(),
|
||||
resetGatewayRestartStateForInProcessRestart: () => resetGatewayRestartStateForInProcessRestart(),
|
||||
resolveGatewayRestartDeferralTimeoutMs: (timeoutMs: unknown) => {
|
||||
if (typeof timeoutMs !== "number" || !Number.isFinite(timeoutMs)) {
|
||||
return DEFAULT_RESTART_DEFERRAL_TIMEOUT_MS;
|
||||
}
|
||||
if (timeoutMs <= 0) {
|
||||
return undefined;
|
||||
}
|
||||
return Math.floor(timeoutMs);
|
||||
},
|
||||
scheduleGatewaySigusr1Restart: (opts?: { delayMs?: number; reason?: string }) =>
|
||||
scheduleGatewaySigusr1Restart(opts),
|
||||
}));
|
||||
@@ -438,6 +448,31 @@ describe("runGatewayLoop", () => {
|
||||
});
|
||||
});
|
||||
|
||||
it("uses the default restart drain timeout when config omits deferralTimeoutMs", async () => {
|
||||
vi.clearAllMocks();
|
||||
loadConfig.mockReturnValue({ gateway: { reload: {} } });
|
||||
peekGatewaySigusr1RestartReason.mockReturnValue(undefined);
|
||||
respawnGatewayProcessForUpdate.mockReturnValue({
|
||||
mode: "disabled",
|
||||
detail: "OPENCLAW_NO_RESPAWN",
|
||||
});
|
||||
|
||||
await withIsolatedSignals(async ({ captureSignal }) => {
|
||||
getActiveTaskCount.mockReturnValueOnce(1).mockReturnValue(0);
|
||||
|
||||
const { start } = await createSignaledLoopHarness();
|
||||
const sigusr1 = captureSignal("SIGUSR1");
|
||||
|
||||
sigusr1();
|
||||
await new Promise<void>((resolve) => setImmediate(resolve));
|
||||
await new Promise<void>((resolve) => setImmediate(resolve));
|
||||
|
||||
expect(waitForActiveTasks).toHaveBeenCalledWith(DEFAULT_RESTART_DEFERRAL_TIMEOUT_MS);
|
||||
expect(markGatewayDraining).toHaveBeenCalledOnce();
|
||||
expect(start).toHaveBeenCalledTimes(2);
|
||||
});
|
||||
});
|
||||
|
||||
it("routes external SIGUSR1 through the restart scheduler before draining", async () => {
|
||||
vi.clearAllMocks();
|
||||
consumeGatewaySigusr1RestartAuthorization.mockReturnValueOnce(false);
|
||||
|
||||
@@ -279,11 +279,12 @@ export async function runGatewayLoop(params: {
|
||||
const SHUTDOWN_TIMEOUT_MS = SUPERVISOR_STOP_TIMEOUT_MS - 5_000;
|
||||
const resolveRestartDrainTimeoutMs = async (): Promise<RestartDrainTimeoutMs> => {
|
||||
try {
|
||||
const { getRuntimeConfig } = await loadRuntimeConfigModule();
|
||||
const [{ getRuntimeConfig }, { resolveGatewayRestartDeferralTimeoutMs }] = await Promise.all([
|
||||
loadRuntimeConfigModule(),
|
||||
loadRestartModule(),
|
||||
]);
|
||||
const timeoutMs = getRuntimeConfig().gateway?.reload?.deferralTimeoutMs;
|
||||
return typeof timeoutMs === "number" && Number.isFinite(timeoutMs) && timeoutMs > 0
|
||||
? timeoutMs
|
||||
: undefined;
|
||||
return resolveGatewayRestartDeferralTimeoutMs(timeoutMs);
|
||||
} catch {
|
||||
return DEFAULT_RESTART_DRAIN_TIMEOUT_MS;
|
||||
}
|
||||
|
||||
@@ -22632,7 +22632,7 @@ export const GENERATED_BASE_CONFIG_SCHEMA: BaseConfigSchemaResponse = {
|
||||
maximum: 9007199254740991,
|
||||
title: "Restart Deferral Timeout (ms)",
|
||||
description:
|
||||
"Optional maximum time (ms) to wait for in-flight operations before forcing a restart. Omit or set 0 to wait indefinitely with periodic still-pending warnings. Lower positive values risk aborting active subagent LLM calls.",
|
||||
"Optional maximum time (ms) to wait for in-flight operations before forcing a restart. Omit to use the default bounded wait; set 0 to wait indefinitely with periodic still-pending warnings. Lower positive values risk aborting active subagent LLM calls.",
|
||||
},
|
||||
},
|
||||
additionalProperties: false,
|
||||
@@ -25803,7 +25803,7 @@ export const GENERATED_BASE_CONFIG_SCHEMA: BaseConfigSchemaResponse = {
|
||||
},
|
||||
"gateway.reload.deferralTimeoutMs": {
|
||||
label: "Restart Deferral Timeout (ms)",
|
||||
help: "Optional maximum time (ms) to wait for in-flight operations before forcing a restart. Omit or set 0 to wait indefinitely with periodic still-pending warnings. Lower positive values risk aborting active subagent LLM calls.",
|
||||
help: "Optional maximum time (ms) to wait for in-flight operations before forcing a restart. Omit to use the default bounded wait; set 0 to wait indefinitely with periodic still-pending warnings. Lower positive values risk aborting active subagent LLM calls.",
|
||||
tags: ["network", "reliability", "performance"],
|
||||
},
|
||||
"gateway.nodes.browser.mode": {
|
||||
|
||||
@@ -492,7 +492,7 @@ export const FIELD_HELP: Record<string, string> = {
|
||||
'Controls how config edits are applied: "off" ignores live edits, "restart" always restarts, "hot" applies in-process, and "hybrid" tries hot then restarts if required. Keep "hybrid" for safest routine updates.',
|
||||
"gateway.reload.debounceMs": "Debounce window (ms) before applying config changes.",
|
||||
"gateway.reload.deferralTimeoutMs":
|
||||
"Optional maximum time (ms) to wait for in-flight operations before forcing a restart. Omit or set 0 to wait indefinitely with periodic still-pending warnings. Lower positive values risk aborting active subagent LLM calls.",
|
||||
"Optional maximum time (ms) to wait for in-flight operations before forcing a restart. Omit to use the default bounded wait; set 0 to wait indefinitely with periodic still-pending warnings. Lower positive values risk aborting active subagent LLM calls.",
|
||||
"gateway.nodes.browser.mode":
|
||||
'Node browser routing ("auto" = pick single connected browser node, "manual" = require node param, "off" = disable).',
|
||||
"gateway.nodes.browser.node": "Pin browser routing to a specific node id or name (optional).",
|
||||
|
||||
@@ -215,8 +215,8 @@ export type GatewayReloadConfig = {
|
||||
debounceMs?: number;
|
||||
/**
|
||||
* Optional maximum time (ms) to wait for in-flight operations to complete
|
||||
* before forcing a restart. Absent or 0 waits indefinitely and logs periodic
|
||||
* still-pending warnings.
|
||||
* before forcing a restart. Absent uses the gateway's default bounded wait;
|
||||
* 0 waits indefinitely and logs periodic still-pending warnings.
|
||||
* Lower positive values risk aborting active subagent LLM calls.
|
||||
* @see https://github.com/openclaw/openclaw/issues/65485
|
||||
*/
|
||||
|
||||
@@ -12,6 +12,7 @@ import { resetDirectoryCache } from "../infra/outbound/target-resolver.js";
|
||||
import {
|
||||
deferGatewayRestartUntilIdle,
|
||||
emitGatewayRestart,
|
||||
resolveGatewayRestartDeferralTimeoutMs,
|
||||
setGatewaySigusr1RestartPolicy,
|
||||
} from "../infra/restart.js";
|
||||
import { getTotalQueueSize } from "../process/command-queue.js";
|
||||
@@ -363,7 +364,9 @@ export function createGatewayReloadHandlers(params: GatewayReloadHandlerParams)
|
||||
|
||||
deferGatewayRestartUntilIdle({
|
||||
getPendingCount: () => getActiveCounts().totalActive,
|
||||
maxWaitMs: nextConfig.gateway?.reload?.deferralTimeoutMs,
|
||||
maxWaitMs: resolveGatewayRestartDeferralTimeoutMs(
|
||||
nextConfig.gateway?.reload?.deferralTimeoutMs,
|
||||
),
|
||||
hooks: {
|
||||
onReady: () => {
|
||||
restartPending = false;
|
||||
|
||||
@@ -727,6 +727,50 @@ describe("gateway hot reload", () => {
|
||||
});
|
||||
});
|
||||
|
||||
it("uses the default restart deferral timeout when config omits deferralTimeoutMs", async () => {
|
||||
await withNonMinimalGatewayServer(async () => {
|
||||
const onRestart = hoisted.getOnRestart();
|
||||
expect(onRestart).toBeTypeOf("function");
|
||||
|
||||
const restartTesting = (await import("../infra/restart.js")).__testing;
|
||||
restartTesting.resetSigusr1State();
|
||||
hoisted.activeTaskCount.value = 1;
|
||||
const signalSpy = vi.fn();
|
||||
process.once("SIGUSR1", signalSpy);
|
||||
vi.useFakeTimers();
|
||||
|
||||
try {
|
||||
onRestart?.(
|
||||
{
|
||||
changedPaths: ["gateway.port"],
|
||||
restartGateway: true,
|
||||
restartReasons: ["gateway.port"],
|
||||
hotReasons: [],
|
||||
reloadHooks: false,
|
||||
restartGmailWatcher: false,
|
||||
restartCron: false,
|
||||
restartHeartbeat: false,
|
||||
restartChannels: new Set(),
|
||||
noopPaths: [],
|
||||
},
|
||||
{},
|
||||
);
|
||||
|
||||
await vi.advanceTimersByTimeAsync(299_500);
|
||||
expect(signalSpy).not.toHaveBeenCalled();
|
||||
|
||||
await vi.advanceTimersByTimeAsync(500);
|
||||
await Promise.resolve();
|
||||
expect(signalSpy).toHaveBeenCalledTimes(1);
|
||||
} finally {
|
||||
hoisted.activeTaskCount.value = 0;
|
||||
vi.useRealTimers();
|
||||
process.removeListener("SIGUSR1", signalSpy);
|
||||
restartTesting.resetSigusr1State();
|
||||
}
|
||||
});
|
||||
});
|
||||
|
||||
it("emits one-shot degraded and recovered system events during secret reload transitions", async () => {
|
||||
await writeEnvRefConfig();
|
||||
process.env.OPENAI_API_KEY = "sk-startup"; // pragma: allowlist secret
|
||||
|
||||
@@ -483,7 +483,7 @@ describe("infra runtime", () => {
|
||||
}
|
||||
});
|
||||
|
||||
it("keeps SIGUSR1 deferred by default while work is still pending", async () => {
|
||||
it("emits SIGUSR1 after the default deferral timeout while work is still pending", async () => {
|
||||
const emitSpy = vi.spyOn(process, "emit");
|
||||
const handler = () => {};
|
||||
process.on("SIGUSR1", handler);
|
||||
@@ -495,8 +495,25 @@ describe("infra runtime", () => {
|
||||
await vi.advanceTimersByTimeAsync(0);
|
||||
expect(emitSpy).not.toHaveBeenCalledWith("SIGUSR1");
|
||||
|
||||
// No default max deferral wait; active turns should not be killed just
|
||||
// because a config-triggered restart has been pending for 5 minutes.
|
||||
await vi.advanceTimersByTimeAsync(300_000);
|
||||
expect(emitSpy).toHaveBeenCalledWith("SIGUSR1");
|
||||
} finally {
|
||||
process.removeListener("SIGUSR1", handler);
|
||||
}
|
||||
});
|
||||
|
||||
it("keeps SIGUSR1 deferred when deferral timeout is explicitly disabled", async () => {
|
||||
const emitSpy = vi.spyOn(process, "emit");
|
||||
const handler = () => {};
|
||||
process.on("SIGUSR1", handler);
|
||||
try {
|
||||
setRuntimeConfigSnapshot({ gateway: { reload: { deferralTimeoutMs: 0 } } });
|
||||
setPreRestartDeferralCheck(() => 5); // always pending
|
||||
scheduleGatewaySigusr1Restart({ delayMs: 0 });
|
||||
|
||||
await vi.advanceTimersByTimeAsync(0);
|
||||
expect(emitSpy).not.toHaveBeenCalledWith("SIGUSR1");
|
||||
|
||||
await vi.advanceTimersByTimeAsync(300_000);
|
||||
expect(emitSpy).not.toHaveBeenCalledWith("SIGUSR1");
|
||||
} finally {
|
||||
|
||||
@@ -20,6 +20,7 @@ const SPAWN_TIMEOUT_MS = 2000;
|
||||
const SIGUSR1_AUTH_GRACE_MS = 5000;
|
||||
const DEFAULT_DEFERRAL_POLL_MS = 500;
|
||||
const DEFAULT_DEFERRAL_STILL_PENDING_WARN_MS = 30_000;
|
||||
export const DEFAULT_RESTART_DEFERRAL_TIMEOUT_MS = 300_000;
|
||||
const RESTART_COOLDOWN_MS = 30_000;
|
||||
const LAUNCHCTL_ALREADY_LOADED_EXIT_CODE = 37;
|
||||
const GATEWAY_RESTART_INTENT_FILENAME = "gateway-restart-intent.json";
|
||||
@@ -368,6 +369,16 @@ export type RestartEmitHooks = {
|
||||
afterEmitRejected?: () => Promise<void>;
|
||||
};
|
||||
|
||||
export function resolveGatewayRestartDeferralTimeoutMs(timeoutMs: unknown): number | undefined {
|
||||
if (typeof timeoutMs !== "number" || !Number.isFinite(timeoutMs)) {
|
||||
return DEFAULT_RESTART_DEFERRAL_TIMEOUT_MS;
|
||||
}
|
||||
if (timeoutMs <= 0) {
|
||||
return undefined;
|
||||
}
|
||||
return Math.floor(timeoutMs);
|
||||
}
|
||||
|
||||
function updatePendingRestartEmitHooks(hooks?: RestartEmitHooks): void {
|
||||
if (hooks) {
|
||||
pendingRestartEmitHooks = hooks;
|
||||
@@ -729,7 +740,7 @@ export function scheduleGatewaySigusr1Restart(opts?: {
|
||||
const cfg = getRuntimeConfig();
|
||||
deferGatewayRestartUntilIdle({
|
||||
getPendingCount: pendingCheck,
|
||||
maxWaitMs: cfg.gateway?.reload?.deferralTimeoutMs,
|
||||
maxWaitMs: resolveGatewayRestartDeferralTimeoutMs(cfg.gateway?.reload?.deferralTimeoutMs),
|
||||
reason: scheduledReason,
|
||||
});
|
||||
},
|
||||
|
||||
Reference in New Issue
Block a user