diff --git a/CHANGELOG.md b/CHANGELOG.md index d8040d79aef..4aaf84db974 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -58,6 +58,7 @@ Docs: https://docs.openclaw.ai - Agents/openai-compatible tool calls: deduplicate repeated tool call ids across live assistant messages and replayed history so OpenAI-compatible backends no longer reject duplicate `tool_call_id` values with HTTP 400. (#40996) Thanks @xaeon2026. - Security/device pairing: harden `device.token.rotate` deny handling by keeping public failures generic while logging internal deny reasons and preserving approved-baseline enforcement. (`GHSA-7jrw-x62h-64p8`) - Slack/interactive replies: preserve `channelData.slack.blocks` through live DM delivery and preview-finalized edits so Block Kit button and select directives render instead of falling back to raw text. (#45890) Thanks @vincentkoc. +- Gateway/restart: defer externally signaled unmanaged restarts through the in-process idle drain, and preserve the restored subagent run as remap fallback during orphan recovery so resumed sessions do not duplicate work. (#47719) Thanks @joeykrug. - Zalo/plugin runtime: export `resolveClientIp` from `openclaw/plugin-sdk/zalo` so installed builds no longer crash on startup when the webhook monitor loads from the packaged extension instead of the monorepo source tree. (#46549) Thanks @No898. - CI/channel test routing: move the built-in channel suites into `test:channels` and keep them out of `test:extensions`, so extension CI no longer fails after the channel migration while targeted test routing still sends Slack, Signal, and iMessage suites to the right lane. (#46066) Thanks @scoootscooob. - Browser/profiles: drop the auto-created `chrome-relay` browser profile; users who need the Chrome extension relay must now create their own profile via `openclaw browser create-profile`. (#45777) Thanks @odysseus0. diff --git a/src/agents/subagent-orphan-recovery.test.ts b/src/agents/subagent-orphan-recovery.test.ts index 56b652b3b42..66b8097154c 100644 --- a/src/agents/subagent-orphan-recovery.test.ts +++ b/src/agents/subagent-orphan-recovery.test.ts @@ -65,8 +65,9 @@ describe("subagent-orphan-recovery", () => { "agent:main:subagent:test-session-1": sessionEntry, }); + const run = createTestRunRecord(); const activeRuns = new Map(); - activeRuns.set("run-1", createTestRunRecord()); + activeRuns.set("run-1", run); const { recoverOrphanedSubagentSessions } = await import("./subagent-orphan-recovery.js"); @@ -87,10 +88,13 @@ describe("subagent-orphan-recovery", () => { expect(params.sessionKey).toBe("agent:main:subagent:test-session-1"); expect(params.message).toContain("gateway reload"); expect(params.message).toContain("Test task: implement feature X"); - expect(subagentRegistry.replaceSubagentRunAfterSteer).toHaveBeenCalledWith({ - previousRunId: "run-1", - nextRunId: "test-run-id", - }); + expect(subagentRegistry.replaceSubagentRunAfterSteer).toHaveBeenCalledWith( + expect.objectContaining({ + previousRunId: "run-1", + nextRunId: "test-run-id", + fallback: run, + }), + ); }); it("skips sessions that are not aborted", async () => { diff --git a/src/agents/subagent-orphan-recovery.ts b/src/agents/subagent-orphan-recovery.ts index ed2eac6d8f3..60408c09ae9 100644 --- a/src/agents/subagent-orphan-recovery.ts +++ b/src/agents/subagent-orphan-recovery.ts @@ -82,6 +82,7 @@ async function resumeOrphanedSession(params: { lastHumanMessage?: string; configChangeHint?: string; originalRunId: string; + originalRun: SubagentRunRecord; }): Promise { let resumeMessage = buildResumeMessage(params.task, params.lastHumanMessage); if (params.configChangeHint) { @@ -103,6 +104,7 @@ async function resumeOrphanedSession(params: { const remapped = replaceSubagentRunAfterSteer({ previousRunId: params.originalRunId, nextRunId: result.runId, + fallback: params.originalRun, }); if (!remapped) { log.warn( @@ -210,6 +212,7 @@ export async function recoverOrphanedSubagentSessions(params: { ? "\n\n[config changes from your previous run were already applied — do not re-modify openclaw.json or restart the gateway]" : undefined, originalRunId: runId, + originalRun: runRecord, }); if (resumed) { diff --git a/src/cli/gateway-cli/run-loop.test.ts b/src/cli/gateway-cli/run-loop.test.ts index bff37742254..ce8fbccbe93 100644 --- a/src/cli/gateway-cli/run-loop.test.ts +++ b/src/cli/gateway-cli/run-loop.test.ts @@ -8,6 +8,15 @@ const acquireGatewayLock = vi.fn(async (_opts?: { port?: number }) => ({ const consumeGatewaySigusr1RestartAuthorization = vi.fn(() => true); const isGatewaySigusr1RestartExternallyAllowed = vi.fn(() => false); const markGatewaySigusr1RestartHandled = vi.fn(); +const scheduleGatewaySigusr1Restart = vi.fn((_opts?: { delayMs?: number; reason?: string }) => ({ + ok: true, + pid: process.pid, + signal: "SIGUSR1" as const, + delayMs: 0, + mode: "emit" as const, + coalesced: false, + cooldownMsApplied: 0, +})); const getActiveTaskCount = vi.fn(() => 0); const markGatewayDraining = vi.fn(); const waitForActiveTasks = vi.fn(async (_timeoutMs: number) => ({ drained: true })); @@ -35,6 +44,8 @@ vi.mock("../../infra/restart.js", () => ({ consumeGatewaySigusr1RestartAuthorization: () => consumeGatewaySigusr1RestartAuthorization(), isGatewaySigusr1RestartExternallyAllowed: () => isGatewaySigusr1RestartExternallyAllowed(), markGatewaySigusr1RestartHandled: () => markGatewaySigusr1RestartHandled(), + scheduleGatewaySigusr1Restart: (opts?: { delayMs?: number; reason?: string }) => + scheduleGatewaySigusr1Restart(opts), })); vi.mock("../../infra/process-respawn.js", () => ({ @@ -292,6 +303,28 @@ describe("runGatewayLoop", () => { }); }); + it("routes external SIGUSR1 through the restart scheduler before draining", async () => { + vi.clearAllMocks(); + consumeGatewaySigusr1RestartAuthorization.mockReturnValueOnce(false); + isGatewaySigusr1RestartExternallyAllowed.mockReturnValueOnce(true); + + await withIsolatedSignals(async ({ captureSignal }) => { + const { close, start } = await createSignaledLoopHarness(); + const sigusr1 = captureSignal("SIGUSR1"); + + sigusr1(); + await new Promise((resolve) => setImmediate(resolve)); + + expect(scheduleGatewaySigusr1Restart).toHaveBeenCalledWith({ + delayMs: 0, + reason: "SIGUSR1", + }); + expect(close).not.toHaveBeenCalled(); + expect(start).toHaveBeenCalledTimes(1); + expect(markGatewaySigusr1RestartHandled).not.toHaveBeenCalled(); + }); + }); + it("releases the lock before exiting on spawned restart", async () => { vi.clearAllMocks(); diff --git a/src/cli/gateway-cli/run-loop.ts b/src/cli/gateway-cli/run-loop.ts index 13ef073a80d..23ec7dd584d 100644 --- a/src/cli/gateway-cli/run-loop.ts +++ b/src/cli/gateway-cli/run-loop.ts @@ -10,6 +10,7 @@ import { consumeGatewaySigusr1RestartAuthorization, isGatewaySigusr1RestartExternallyAllowed, markGatewaySigusr1RestartHandled, + scheduleGatewaySigusr1Restart, } from "../../infra/restart.js"; import { createSubsystemLogger } from "../../logging/subsystem.js"; import { @@ -186,10 +187,20 @@ export async function runGatewayLoop(params: { const onSigusr1 = () => { gatewayLog.info("signal SIGUSR1 received"); const authorized = consumeGatewaySigusr1RestartAuthorization(); - if (!authorized && !isGatewaySigusr1RestartExternallyAllowed()) { - gatewayLog.warn( - "SIGUSR1 restart ignored (not authorized; commands.restart=false or use gateway tool).", - ); + if (!authorized) { + if (!isGatewaySigusr1RestartExternallyAllowed()) { + gatewayLog.warn( + "SIGUSR1 restart ignored (not authorized; commands.restart=false or use gateway tool).", + ); + return; + } + if (shuttingDown) { + gatewayLog.info("received SIGUSR1 during shutdown; ignoring"); + return; + } + // External SIGUSR1 requests should still reuse the in-process restart + // scheduler so idle drain and restart coalescing stay consistent. + scheduleGatewaySigusr1Restart({ delayMs: 0, reason: "SIGUSR1" }); return; } markGatewaySigusr1RestartHandled(); diff --git a/src/config/schema.labels.ts b/src/config/schema.labels.ts index d2c0cb29e48..c8fb887924b 100644 --- a/src/config/schema.labels.ts +++ b/src/config/schema.labels.ts @@ -279,6 +279,7 @@ export const FIELD_LABELS: Record = { "OpenAI Chat Completions Image Timeout (ms)", "gateway.reload.mode": "Config Reload Mode", "gateway.reload.debounceMs": "Config Reload Debounce (ms)", + "gateway.reload.deferralTimeoutMs": "Restart Deferral Timeout (ms)", "gateway.nodes.browser.mode": "Gateway Node Browser Mode", "gateway.nodes.browser.node": "Gateway Node Browser Pin", "gateway.nodes.allowCommands": "Gateway Node Allowlist (Extra Commands)", diff --git a/src/infra/infra-runtime.test.ts b/src/infra/infra-runtime.test.ts index 2072f8f2da3..97f2336fd11 100644 --- a/src/infra/infra-runtime.test.ts +++ b/src/infra/infra-runtime.test.ts @@ -190,8 +190,8 @@ describe("infra runtime", () => { await vi.advanceTimersByTimeAsync(0); expect(emitSpy).not.toHaveBeenCalledWith("SIGUSR1"); - // Advance past the 90s max deferral wait - await vi.advanceTimersByTimeAsync(90_000); + // Advance past the 5-minute max deferral wait + await vi.advanceTimersByTimeAsync(300_000); expect(emitSpy).toHaveBeenCalledWith("SIGUSR1"); } finally { process.removeListener("SIGUSR1", handler);