mirror of
https://github.com/openclaw/openclaw.git
synced 2026-05-06 08:40:44 +00:00
fix(gateway): bound default restart deferral
This commit is contained in:
@@ -1,4 +1,4 @@
|
|||||||
d4c98bce7b547349b9cbbe08ec1018eafce9900502d7794df993d07fdec0e2e0 config-baseline.json
|
7f9815a297504c75022c4db2df250ce4cc9ff5c3f69250c67ca253b89148b9f3 config-baseline.json
|
||||||
6ce74b2ab3544e5375009a435a2360a3095e6bd759bb7dd8114293fb8a0e2b25 config-baseline.core.json
|
8bc9fda7c1096472beaa416a61043ce51d691d4dcad9ed3e0be46e68bb70b0ce config-baseline.core.json
|
||||||
0e38bad86bdc96c38573f6d51ac9e6fc5306cc20fb4a454399c57c105a61ba87 config-baseline.channel.json
|
45162ff84813be8a1fe561ed8d6245a248d5c6288ef9e9af51bdf4ec05ef65ad config-baseline.channel.json
|
||||||
0dd6583fafae6c9134e46c4cf9bddee9822d6436436dcb1a6dcba6d012962e51 config-baseline.plugin.json
|
0dd6583fafae6c9134e46c4cf9bddee9822d6436436dcb1a6dcba6d012962e51 config-baseline.plugin.json
|
||||||
|
|||||||
@@ -515,7 +515,7 @@ See [Multiple Gateways](/gateway/multiple-gateways).
|
|||||||
reload: {
|
reload: {
|
||||||
mode: "hybrid", // off | restart | hot | hybrid
|
mode: "hybrid", // off | restart | hot | hybrid
|
||||||
debounceMs: 500,
|
debounceMs: 500,
|
||||||
deferralTimeoutMs: 0,
|
deferralTimeoutMs: 300000,
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
@@ -527,7 +527,7 @@ See [Multiple Gateways](/gateway/multiple-gateways).
|
|||||||
- `"hot"`: apply changes in-process without restarting.
|
- `"hot"`: apply changes in-process without restarting.
|
||||||
- `"hybrid"` (default): try hot reload first; fall back to restart if required.
|
- `"hybrid"` (default): try hot reload first; fall back to restart if required.
|
||||||
- `debounceMs`: debounce window in ms before config changes are applied (non-negative integer).
|
- `debounceMs`: debounce window in ms before config changes are applied (non-negative integer).
|
||||||
- `deferralTimeoutMs`: optional maximum time in ms to wait for in-flight operations before forcing a restart. Omit it or set `0` to wait indefinitely and log periodic still-pending warnings.
|
- `deferralTimeoutMs`: optional maximum time in ms to wait for in-flight operations before forcing a restart. Omit it to use the default bounded wait (`300000`); set `0` to wait indefinitely and log periodic still-pending warnings.
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
|
|||||||
@@ -50,7 +50,8 @@ const abortEmbeddedPiRun = vi.fn(
|
|||||||
const getActiveEmbeddedRunCount = vi.fn(() => 0);
|
const getActiveEmbeddedRunCount = vi.fn(() => 0);
|
||||||
const waitForActiveEmbeddedRuns = vi.fn(async (_timeoutMs?: number) => ({ drained: true }));
|
const waitForActiveEmbeddedRuns = vi.fn(async (_timeoutMs?: number) => ({ drained: true }));
|
||||||
const DRAIN_TIMEOUT_LOG = "drain timeout reached; proceeding with restart";
|
const DRAIN_TIMEOUT_LOG = "drain timeout reached; proceeding with restart";
|
||||||
const loadConfig = vi.fn(() => ({
|
const DEFAULT_RESTART_DEFERRAL_TIMEOUT_MS = 300_000;
|
||||||
|
const loadConfig = vi.fn<() => { gateway: { reload: { deferralTimeoutMs?: number } } }>(() => ({
|
||||||
gateway: {
|
gateway: {
|
||||||
reload: {
|
reload: {
|
||||||
deferralTimeoutMs: 90_000,
|
deferralTimeoutMs: 90_000,
|
||||||
@@ -74,6 +75,15 @@ vi.mock("../../infra/restart.js", () => ({
|
|||||||
markGatewaySigusr1RestartHandled: () => markGatewaySigusr1RestartHandled(),
|
markGatewaySigusr1RestartHandled: () => markGatewaySigusr1RestartHandled(),
|
||||||
peekGatewaySigusr1RestartReason: () => peekGatewaySigusr1RestartReason(),
|
peekGatewaySigusr1RestartReason: () => peekGatewaySigusr1RestartReason(),
|
||||||
resetGatewayRestartStateForInProcessRestart: () => resetGatewayRestartStateForInProcessRestart(),
|
resetGatewayRestartStateForInProcessRestart: () => resetGatewayRestartStateForInProcessRestart(),
|
||||||
|
resolveGatewayRestartDeferralTimeoutMs: (timeoutMs: unknown) => {
|
||||||
|
if (typeof timeoutMs !== "number" || !Number.isFinite(timeoutMs)) {
|
||||||
|
return DEFAULT_RESTART_DEFERRAL_TIMEOUT_MS;
|
||||||
|
}
|
||||||
|
if (timeoutMs <= 0) {
|
||||||
|
return undefined;
|
||||||
|
}
|
||||||
|
return Math.floor(timeoutMs);
|
||||||
|
},
|
||||||
scheduleGatewaySigusr1Restart: (opts?: { delayMs?: number; reason?: string }) =>
|
scheduleGatewaySigusr1Restart: (opts?: { delayMs?: number; reason?: string }) =>
|
||||||
scheduleGatewaySigusr1Restart(opts),
|
scheduleGatewaySigusr1Restart(opts),
|
||||||
}));
|
}));
|
||||||
@@ -438,6 +448,31 @@ describe("runGatewayLoop", () => {
|
|||||||
});
|
});
|
||||||
});
|
});
|
||||||
|
|
||||||
|
it("uses the default restart drain timeout when config omits deferralTimeoutMs", async () => {
|
||||||
|
vi.clearAllMocks();
|
||||||
|
loadConfig.mockReturnValue({ gateway: { reload: {} } });
|
||||||
|
peekGatewaySigusr1RestartReason.mockReturnValue(undefined);
|
||||||
|
respawnGatewayProcessForUpdate.mockReturnValue({
|
||||||
|
mode: "disabled",
|
||||||
|
detail: "OPENCLAW_NO_RESPAWN",
|
||||||
|
});
|
||||||
|
|
||||||
|
await withIsolatedSignals(async ({ captureSignal }) => {
|
||||||
|
getActiveTaskCount.mockReturnValueOnce(1).mockReturnValue(0);
|
||||||
|
|
||||||
|
const { start } = await createSignaledLoopHarness();
|
||||||
|
const sigusr1 = captureSignal("SIGUSR1");
|
||||||
|
|
||||||
|
sigusr1();
|
||||||
|
await new Promise<void>((resolve) => setImmediate(resolve));
|
||||||
|
await new Promise<void>((resolve) => setImmediate(resolve));
|
||||||
|
|
||||||
|
expect(waitForActiveTasks).toHaveBeenCalledWith(DEFAULT_RESTART_DEFERRAL_TIMEOUT_MS);
|
||||||
|
expect(markGatewayDraining).toHaveBeenCalledOnce();
|
||||||
|
expect(start).toHaveBeenCalledTimes(2);
|
||||||
|
});
|
||||||
|
});
|
||||||
|
|
||||||
it("routes external SIGUSR1 through the restart scheduler before draining", async () => {
|
it("routes external SIGUSR1 through the restart scheduler before draining", async () => {
|
||||||
vi.clearAllMocks();
|
vi.clearAllMocks();
|
||||||
consumeGatewaySigusr1RestartAuthorization.mockReturnValueOnce(false);
|
consumeGatewaySigusr1RestartAuthorization.mockReturnValueOnce(false);
|
||||||
|
|||||||
@@ -279,11 +279,12 @@ export async function runGatewayLoop(params: {
|
|||||||
const SHUTDOWN_TIMEOUT_MS = SUPERVISOR_STOP_TIMEOUT_MS - 5_000;
|
const SHUTDOWN_TIMEOUT_MS = SUPERVISOR_STOP_TIMEOUT_MS - 5_000;
|
||||||
const resolveRestartDrainTimeoutMs = async (): Promise<RestartDrainTimeoutMs> => {
|
const resolveRestartDrainTimeoutMs = async (): Promise<RestartDrainTimeoutMs> => {
|
||||||
try {
|
try {
|
||||||
const { getRuntimeConfig } = await loadRuntimeConfigModule();
|
const [{ getRuntimeConfig }, { resolveGatewayRestartDeferralTimeoutMs }] = await Promise.all([
|
||||||
|
loadRuntimeConfigModule(),
|
||||||
|
loadRestartModule(),
|
||||||
|
]);
|
||||||
const timeoutMs = getRuntimeConfig().gateway?.reload?.deferralTimeoutMs;
|
const timeoutMs = getRuntimeConfig().gateway?.reload?.deferralTimeoutMs;
|
||||||
return typeof timeoutMs === "number" && Number.isFinite(timeoutMs) && timeoutMs > 0
|
return resolveGatewayRestartDeferralTimeoutMs(timeoutMs);
|
||||||
? timeoutMs
|
|
||||||
: undefined;
|
|
||||||
} catch {
|
} catch {
|
||||||
return DEFAULT_RESTART_DRAIN_TIMEOUT_MS;
|
return DEFAULT_RESTART_DRAIN_TIMEOUT_MS;
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -22632,7 +22632,7 @@ export const GENERATED_BASE_CONFIG_SCHEMA: BaseConfigSchemaResponse = {
|
|||||||
maximum: 9007199254740991,
|
maximum: 9007199254740991,
|
||||||
title: "Restart Deferral Timeout (ms)",
|
title: "Restart Deferral Timeout (ms)",
|
||||||
description:
|
description:
|
||||||
"Optional maximum time (ms) to wait for in-flight operations before forcing a restart. Omit or set 0 to wait indefinitely with periodic still-pending warnings. Lower positive values risk aborting active subagent LLM calls.",
|
"Optional maximum time (ms) to wait for in-flight operations before forcing a restart. Omit to use the default bounded wait; set 0 to wait indefinitely with periodic still-pending warnings. Lower positive values risk aborting active subagent LLM calls.",
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
additionalProperties: false,
|
additionalProperties: false,
|
||||||
@@ -25803,7 +25803,7 @@ export const GENERATED_BASE_CONFIG_SCHEMA: BaseConfigSchemaResponse = {
|
|||||||
},
|
},
|
||||||
"gateway.reload.deferralTimeoutMs": {
|
"gateway.reload.deferralTimeoutMs": {
|
||||||
label: "Restart Deferral Timeout (ms)",
|
label: "Restart Deferral Timeout (ms)",
|
||||||
help: "Optional maximum time (ms) to wait for in-flight operations before forcing a restart. Omit or set 0 to wait indefinitely with periodic still-pending warnings. Lower positive values risk aborting active subagent LLM calls.",
|
help: "Optional maximum time (ms) to wait for in-flight operations before forcing a restart. Omit to use the default bounded wait; set 0 to wait indefinitely with periodic still-pending warnings. Lower positive values risk aborting active subagent LLM calls.",
|
||||||
tags: ["network", "reliability", "performance"],
|
tags: ["network", "reliability", "performance"],
|
||||||
},
|
},
|
||||||
"gateway.nodes.browser.mode": {
|
"gateway.nodes.browser.mode": {
|
||||||
|
|||||||
@@ -492,7 +492,7 @@ export const FIELD_HELP: Record<string, string> = {
|
|||||||
'Controls how config edits are applied: "off" ignores live edits, "restart" always restarts, "hot" applies in-process, and "hybrid" tries hot then restarts if required. Keep "hybrid" for safest routine updates.',
|
'Controls how config edits are applied: "off" ignores live edits, "restart" always restarts, "hot" applies in-process, and "hybrid" tries hot then restarts if required. Keep "hybrid" for safest routine updates.',
|
||||||
"gateway.reload.debounceMs": "Debounce window (ms) before applying config changes.",
|
"gateway.reload.debounceMs": "Debounce window (ms) before applying config changes.",
|
||||||
"gateway.reload.deferralTimeoutMs":
|
"gateway.reload.deferralTimeoutMs":
|
||||||
"Optional maximum time (ms) to wait for in-flight operations before forcing a restart. Omit or set 0 to wait indefinitely with periodic still-pending warnings. Lower positive values risk aborting active subagent LLM calls.",
|
"Optional maximum time (ms) to wait for in-flight operations before forcing a restart. Omit to use the default bounded wait; set 0 to wait indefinitely with periodic still-pending warnings. Lower positive values risk aborting active subagent LLM calls.",
|
||||||
"gateway.nodes.browser.mode":
|
"gateway.nodes.browser.mode":
|
||||||
'Node browser routing ("auto" = pick single connected browser node, "manual" = require node param, "off" = disable).',
|
'Node browser routing ("auto" = pick single connected browser node, "manual" = require node param, "off" = disable).',
|
||||||
"gateway.nodes.browser.node": "Pin browser routing to a specific node id or name (optional).",
|
"gateway.nodes.browser.node": "Pin browser routing to a specific node id or name (optional).",
|
||||||
|
|||||||
@@ -215,8 +215,8 @@ export type GatewayReloadConfig = {
|
|||||||
debounceMs?: number;
|
debounceMs?: number;
|
||||||
/**
|
/**
|
||||||
* Optional maximum time (ms) to wait for in-flight operations to complete
|
* Optional maximum time (ms) to wait for in-flight operations to complete
|
||||||
* before forcing a restart. Absent or 0 waits indefinitely and logs periodic
|
* before forcing a restart. Absent uses the gateway's default bounded wait;
|
||||||
* still-pending warnings.
|
* 0 waits indefinitely and logs periodic still-pending warnings.
|
||||||
* Lower positive values risk aborting active subagent LLM calls.
|
* Lower positive values risk aborting active subagent LLM calls.
|
||||||
* @see https://github.com/openclaw/openclaw/issues/65485
|
* @see https://github.com/openclaw/openclaw/issues/65485
|
||||||
*/
|
*/
|
||||||
|
|||||||
@@ -12,6 +12,7 @@ import { resetDirectoryCache } from "../infra/outbound/target-resolver.js";
|
|||||||
import {
|
import {
|
||||||
deferGatewayRestartUntilIdle,
|
deferGatewayRestartUntilIdle,
|
||||||
emitGatewayRestart,
|
emitGatewayRestart,
|
||||||
|
resolveGatewayRestartDeferralTimeoutMs,
|
||||||
setGatewaySigusr1RestartPolicy,
|
setGatewaySigusr1RestartPolicy,
|
||||||
} from "../infra/restart.js";
|
} from "../infra/restart.js";
|
||||||
import { getTotalQueueSize } from "../process/command-queue.js";
|
import { getTotalQueueSize } from "../process/command-queue.js";
|
||||||
@@ -363,7 +364,9 @@ export function createGatewayReloadHandlers(params: GatewayReloadHandlerParams)
|
|||||||
|
|
||||||
deferGatewayRestartUntilIdle({
|
deferGatewayRestartUntilIdle({
|
||||||
getPendingCount: () => getActiveCounts().totalActive,
|
getPendingCount: () => getActiveCounts().totalActive,
|
||||||
maxWaitMs: nextConfig.gateway?.reload?.deferralTimeoutMs,
|
maxWaitMs: resolveGatewayRestartDeferralTimeoutMs(
|
||||||
|
nextConfig.gateway?.reload?.deferralTimeoutMs,
|
||||||
|
),
|
||||||
hooks: {
|
hooks: {
|
||||||
onReady: () => {
|
onReady: () => {
|
||||||
restartPending = false;
|
restartPending = false;
|
||||||
|
|||||||
@@ -727,6 +727,50 @@ describe("gateway hot reload", () => {
|
|||||||
});
|
});
|
||||||
});
|
});
|
||||||
|
|
||||||
|
it("uses the default restart deferral timeout when config omits deferralTimeoutMs", async () => {
|
||||||
|
await withNonMinimalGatewayServer(async () => {
|
||||||
|
const onRestart = hoisted.getOnRestart();
|
||||||
|
expect(onRestart).toBeTypeOf("function");
|
||||||
|
|
||||||
|
const restartTesting = (await import("../infra/restart.js")).__testing;
|
||||||
|
restartTesting.resetSigusr1State();
|
||||||
|
hoisted.activeTaskCount.value = 1;
|
||||||
|
const signalSpy = vi.fn();
|
||||||
|
process.once("SIGUSR1", signalSpy);
|
||||||
|
vi.useFakeTimers();
|
||||||
|
|
||||||
|
try {
|
||||||
|
onRestart?.(
|
||||||
|
{
|
||||||
|
changedPaths: ["gateway.port"],
|
||||||
|
restartGateway: true,
|
||||||
|
restartReasons: ["gateway.port"],
|
||||||
|
hotReasons: [],
|
||||||
|
reloadHooks: false,
|
||||||
|
restartGmailWatcher: false,
|
||||||
|
restartCron: false,
|
||||||
|
restartHeartbeat: false,
|
||||||
|
restartChannels: new Set(),
|
||||||
|
noopPaths: [],
|
||||||
|
},
|
||||||
|
{},
|
||||||
|
);
|
||||||
|
|
||||||
|
await vi.advanceTimersByTimeAsync(299_500);
|
||||||
|
expect(signalSpy).not.toHaveBeenCalled();
|
||||||
|
|
||||||
|
await vi.advanceTimersByTimeAsync(500);
|
||||||
|
await Promise.resolve();
|
||||||
|
expect(signalSpy).toHaveBeenCalledTimes(1);
|
||||||
|
} finally {
|
||||||
|
hoisted.activeTaskCount.value = 0;
|
||||||
|
vi.useRealTimers();
|
||||||
|
process.removeListener("SIGUSR1", signalSpy);
|
||||||
|
restartTesting.resetSigusr1State();
|
||||||
|
}
|
||||||
|
});
|
||||||
|
});
|
||||||
|
|
||||||
it("emits one-shot degraded and recovered system events during secret reload transitions", async () => {
|
it("emits one-shot degraded and recovered system events during secret reload transitions", async () => {
|
||||||
await writeEnvRefConfig();
|
await writeEnvRefConfig();
|
||||||
process.env.OPENAI_API_KEY = "sk-startup"; // pragma: allowlist secret
|
process.env.OPENAI_API_KEY = "sk-startup"; // pragma: allowlist secret
|
||||||
|
|||||||
@@ -483,7 +483,7 @@ describe("infra runtime", () => {
|
|||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
|
||||||
it("keeps SIGUSR1 deferred by default while work is still pending", async () => {
|
it("emits SIGUSR1 after the default deferral timeout while work is still pending", async () => {
|
||||||
const emitSpy = vi.spyOn(process, "emit");
|
const emitSpy = vi.spyOn(process, "emit");
|
||||||
const handler = () => {};
|
const handler = () => {};
|
||||||
process.on("SIGUSR1", handler);
|
process.on("SIGUSR1", handler);
|
||||||
@@ -495,8 +495,25 @@ describe("infra runtime", () => {
|
|||||||
await vi.advanceTimersByTimeAsync(0);
|
await vi.advanceTimersByTimeAsync(0);
|
||||||
expect(emitSpy).not.toHaveBeenCalledWith("SIGUSR1");
|
expect(emitSpy).not.toHaveBeenCalledWith("SIGUSR1");
|
||||||
|
|
||||||
// No default max deferral wait; active turns should not be killed just
|
await vi.advanceTimersByTimeAsync(300_000);
|
||||||
// because a config-triggered restart has been pending for 5 minutes.
|
expect(emitSpy).toHaveBeenCalledWith("SIGUSR1");
|
||||||
|
} finally {
|
||||||
|
process.removeListener("SIGUSR1", handler);
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
it("keeps SIGUSR1 deferred when deferral timeout is explicitly disabled", async () => {
|
||||||
|
const emitSpy = vi.spyOn(process, "emit");
|
||||||
|
const handler = () => {};
|
||||||
|
process.on("SIGUSR1", handler);
|
||||||
|
try {
|
||||||
|
setRuntimeConfigSnapshot({ gateway: { reload: { deferralTimeoutMs: 0 } } });
|
||||||
|
setPreRestartDeferralCheck(() => 5); // always pending
|
||||||
|
scheduleGatewaySigusr1Restart({ delayMs: 0 });
|
||||||
|
|
||||||
|
await vi.advanceTimersByTimeAsync(0);
|
||||||
|
expect(emitSpy).not.toHaveBeenCalledWith("SIGUSR1");
|
||||||
|
|
||||||
await vi.advanceTimersByTimeAsync(300_000);
|
await vi.advanceTimersByTimeAsync(300_000);
|
||||||
expect(emitSpy).not.toHaveBeenCalledWith("SIGUSR1");
|
expect(emitSpy).not.toHaveBeenCalledWith("SIGUSR1");
|
||||||
} finally {
|
} finally {
|
||||||
|
|||||||
@@ -20,6 +20,7 @@ const SPAWN_TIMEOUT_MS = 2000;
|
|||||||
const SIGUSR1_AUTH_GRACE_MS = 5000;
|
const SIGUSR1_AUTH_GRACE_MS = 5000;
|
||||||
const DEFAULT_DEFERRAL_POLL_MS = 500;
|
const DEFAULT_DEFERRAL_POLL_MS = 500;
|
||||||
const DEFAULT_DEFERRAL_STILL_PENDING_WARN_MS = 30_000;
|
const DEFAULT_DEFERRAL_STILL_PENDING_WARN_MS = 30_000;
|
||||||
|
export const DEFAULT_RESTART_DEFERRAL_TIMEOUT_MS = 300_000;
|
||||||
const RESTART_COOLDOWN_MS = 30_000;
|
const RESTART_COOLDOWN_MS = 30_000;
|
||||||
const LAUNCHCTL_ALREADY_LOADED_EXIT_CODE = 37;
|
const LAUNCHCTL_ALREADY_LOADED_EXIT_CODE = 37;
|
||||||
const GATEWAY_RESTART_INTENT_FILENAME = "gateway-restart-intent.json";
|
const GATEWAY_RESTART_INTENT_FILENAME = "gateway-restart-intent.json";
|
||||||
@@ -368,6 +369,16 @@ export type RestartEmitHooks = {
|
|||||||
afterEmitRejected?: () => Promise<void>;
|
afterEmitRejected?: () => Promise<void>;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
export function resolveGatewayRestartDeferralTimeoutMs(timeoutMs: unknown): number | undefined {
|
||||||
|
if (typeof timeoutMs !== "number" || !Number.isFinite(timeoutMs)) {
|
||||||
|
return DEFAULT_RESTART_DEFERRAL_TIMEOUT_MS;
|
||||||
|
}
|
||||||
|
if (timeoutMs <= 0) {
|
||||||
|
return undefined;
|
||||||
|
}
|
||||||
|
return Math.floor(timeoutMs);
|
||||||
|
}
|
||||||
|
|
||||||
function updatePendingRestartEmitHooks(hooks?: RestartEmitHooks): void {
|
function updatePendingRestartEmitHooks(hooks?: RestartEmitHooks): void {
|
||||||
if (hooks) {
|
if (hooks) {
|
||||||
pendingRestartEmitHooks = hooks;
|
pendingRestartEmitHooks = hooks;
|
||||||
@@ -729,7 +740,7 @@ export function scheduleGatewaySigusr1Restart(opts?: {
|
|||||||
const cfg = getRuntimeConfig();
|
const cfg = getRuntimeConfig();
|
||||||
deferGatewayRestartUntilIdle({
|
deferGatewayRestartUntilIdle({
|
||||||
getPendingCount: pendingCheck,
|
getPendingCount: pendingCheck,
|
||||||
maxWaitMs: cfg.gateway?.reload?.deferralTimeoutMs,
|
maxWaitMs: resolveGatewayRestartDeferralTimeoutMs(cfg.gateway?.reload?.deferralTimeoutMs),
|
||||||
reason: scheduledReason,
|
reason: scheduledReason,
|
||||||
});
|
});
|
||||||
},
|
},
|
||||||
|
|||||||
Reference in New Issue
Block a user