fix: land SIGUSR1 orphan recovery regressions (#47719) (thanks @joeykrug)

2026-05-06 21:20:43 +00:00 · 2026-03-16 05:31:41 +00:00
parent 98f6ec50aa
commit 680eff63fb
7 changed files with 64 additions and 11 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -58,6 +58,7 @@ Docs: https://docs.openclaw.ai
 - Agents/openai-compatible tool calls: deduplicate repeated tool call ids across live assistant messages and replayed history so OpenAI-compatible backends no longer reject duplicate `tool_call_id` values with HTTP 400. (#40996) Thanks @xaeon2026.
 - Security/device pairing: harden `device.token.rotate` deny handling by keeping public failures generic while logging internal deny reasons and preserving approved-baseline enforcement. (`GHSA-7jrw-x62h-64p8`)
 - Slack/interactive replies: preserve `channelData.slack.blocks` through live DM delivery and preview-finalized edits so Block Kit button and select directives render instead of falling back to raw text. (#45890) Thanks @vincentkoc.
+- Gateway/restart: defer externally signaled unmanaged restarts through the in-process idle drain, and preserve the restored subagent run as remap fallback during orphan recovery so resumed sessions do not duplicate work. (#47719) Thanks @joeykrug.
 - Zalo/plugin runtime: export `resolveClientIp` from `openclaw/plugin-sdk/zalo` so installed builds no longer crash on startup when the webhook monitor loads from the packaged extension instead of the monorepo source tree. (#46549) Thanks @No898.
 - CI/channel test routing: move the built-in channel suites into `test:channels` and keep them out of `test:extensions`, so extension CI no longer fails after the channel migration while targeted test routing still sends Slack, Signal, and iMessage suites to the right lane. (#46066) Thanks @scoootscooob.
 - Browser/profiles: drop the auto-created `chrome-relay` browser profile; users who need the Chrome extension relay must now create their own profile via `openclaw browser create-profile`. (#45777) Thanks @odysseus0.
--- a/src/agents/subagent-orphan-recovery.test.ts
+++ b/src/agents/subagent-orphan-recovery.test.ts
@@ -65,8 +65,9 @@ describe("subagent-orphan-recovery", () => {
      "agent:main:subagent:test-session-1": sessionEntry,
    });

+    const run = createTestRunRecord();
    const activeRuns = new Map<string, SubagentRunRecord>();
-    activeRuns.set("run-1", createTestRunRecord());
+    activeRuns.set("run-1", run);

    const { recoverOrphanedSubagentSessions } = await import("./subagent-orphan-recovery.js");

@@ -87,10 +88,13 @@ describe("subagent-orphan-recovery", () => {
    expect(params.sessionKey).toBe("agent:main:subagent:test-session-1");
    expect(params.message).toContain("gateway reload");
    expect(params.message).toContain("Test task: implement feature X");
-    expect(subagentRegistry.replaceSubagentRunAfterSteer).toHaveBeenCalledWith({
-      previousRunId: "run-1",
-      nextRunId: "test-run-id",
-    });
+    expect(subagentRegistry.replaceSubagentRunAfterSteer).toHaveBeenCalledWith(
+      expect.objectContaining({
+        previousRunId: "run-1",
+        nextRunId: "test-run-id",
+        fallback: run,
+      }),
+    );
  });

  it("skips sessions that are not aborted", async () => {
--- a/src/agents/subagent-orphan-recovery.ts
+++ b/src/agents/subagent-orphan-recovery.ts
@@ -82,6 +82,7 @@ async function resumeOrphanedSession(params: {
  lastHumanMessage?: string;
  configChangeHint?: string;
  originalRunId: string;
+  originalRun: SubagentRunRecord;
 }): Promise<boolean> {
  let resumeMessage = buildResumeMessage(params.task, params.lastHumanMessage);
  if (params.configChangeHint) {
@@ -103,6 +104,7 @@ async function resumeOrphanedSession(params: {
    const remapped = replaceSubagentRunAfterSteer({
      previousRunId: params.originalRunId,
      nextRunId: result.runId,
+      fallback: params.originalRun,
    });
    if (!remapped) {
      log.warn(
@@ -210,6 +212,7 @@ export async function recoverOrphanedSubagentSessions(params: {
            ? "\n\n[config changes from your previous run were already applied — do not re-modify openclaw.json or restart the gateway]"
            : undefined,
          originalRunId: runId,
+          originalRun: runRecord,
        });

        if (resumed) {
--- a/src/cli/gateway-cli/run-loop.test.ts
+++ b/src/cli/gateway-cli/run-loop.test.ts
@@ -8,6 +8,15 @@ const acquireGatewayLock = vi.fn(async (_opts?: { port?: number }) => ({
 const consumeGatewaySigusr1RestartAuthorization = vi.fn(() => true);
 const isGatewaySigusr1RestartExternallyAllowed = vi.fn(() => false);
 const markGatewaySigusr1RestartHandled = vi.fn();
+const scheduleGatewaySigusr1Restart = vi.fn((_opts?: { delayMs?: number; reason?: string }) => ({
+  ok: true,
+  pid: process.pid,
+  signal: "SIGUSR1" as const,
+  delayMs: 0,
+  mode: "emit" as const,
+  coalesced: false,
+  cooldownMsApplied: 0,
+}));
 const getActiveTaskCount = vi.fn(() => 0);
 const markGatewayDraining = vi.fn();
 const waitForActiveTasks = vi.fn(async (_timeoutMs: number) => ({ drained: true }));
@@ -35,6 +44,8 @@ vi.mock("../../infra/restart.js", () => ({
  consumeGatewaySigusr1RestartAuthorization: () => consumeGatewaySigusr1RestartAuthorization(),
  isGatewaySigusr1RestartExternallyAllowed: () => isGatewaySigusr1RestartExternallyAllowed(),
  markGatewaySigusr1RestartHandled: () => markGatewaySigusr1RestartHandled(),
+  scheduleGatewaySigusr1Restart: (opts?: { delayMs?: number; reason?: string }) =>
+    scheduleGatewaySigusr1Restart(opts),
 }));

 vi.mock("../../infra/process-respawn.js", () => ({
@@ -292,6 +303,28 @@ describe("runGatewayLoop", () => {
    });
  });

+  it("routes external SIGUSR1 through the restart scheduler before draining", async () => {
+    vi.clearAllMocks();
+    consumeGatewaySigusr1RestartAuthorization.mockReturnValueOnce(false);
+    isGatewaySigusr1RestartExternallyAllowed.mockReturnValueOnce(true);
+
+    await withIsolatedSignals(async ({ captureSignal }) => {
+      const { close, start } = await createSignaledLoopHarness();
+      const sigusr1 = captureSignal("SIGUSR1");
+
+      sigusr1();
+      await new Promise<void>((resolve) => setImmediate(resolve));
+
+      expect(scheduleGatewaySigusr1Restart).toHaveBeenCalledWith({
+        delayMs: 0,
+        reason: "SIGUSR1",
+      });
+      expect(close).not.toHaveBeenCalled();
+      expect(start).toHaveBeenCalledTimes(1);
+      expect(markGatewaySigusr1RestartHandled).not.toHaveBeenCalled();
+    });
+  });
+
  it("releases the lock before exiting on spawned restart", async () => {
    vi.clearAllMocks();

--- a/src/cli/gateway-cli/run-loop.ts
+++ b/src/cli/gateway-cli/run-loop.ts
@@ -10,6 +10,7 @@ import {
  consumeGatewaySigusr1RestartAuthorization,
  isGatewaySigusr1RestartExternallyAllowed,
  markGatewaySigusr1RestartHandled,
+  scheduleGatewaySigusr1Restart,
 } from "../../infra/restart.js";
 import { createSubsystemLogger } from "../../logging/subsystem.js";
 import {
@@ -186,10 +187,20 @@ export async function runGatewayLoop(params: {
  const onSigusr1 = () => {
    gatewayLog.info("signal SIGUSR1 received");
    const authorized = consumeGatewaySigusr1RestartAuthorization();
-    if (!authorized && !isGatewaySigusr1RestartExternallyAllowed()) {
-      gatewayLog.warn(
-        "SIGUSR1 restart ignored (not authorized; commands.restart=false or use gateway tool).",
-      );
+    if (!authorized) {
+      if (!isGatewaySigusr1RestartExternallyAllowed()) {
+        gatewayLog.warn(
+          "SIGUSR1 restart ignored (not authorized; commands.restart=false or use gateway tool).",
+        );
+        return;
+      }
+      if (shuttingDown) {
+        gatewayLog.info("received SIGUSR1 during shutdown; ignoring");
+        return;
+      }
+      // External SIGUSR1 requests should still reuse the in-process restart
+      // scheduler so idle drain and restart coalescing stay consistent.
+      scheduleGatewaySigusr1Restart({ delayMs: 0, reason: "SIGUSR1" });
      return;
    }
    markGatewaySigusr1RestartHandled();
--- a/src/config/schema.labels.ts
+++ b/src/config/schema.labels.ts
@@ -279,6 +279,7 @@ export const FIELD_LABELS: Record<string, string> = {
    "OpenAI Chat Completions Image Timeout (ms)",
  "gateway.reload.mode": "Config Reload Mode",
  "gateway.reload.debounceMs": "Config Reload Debounce (ms)",
+  "gateway.reload.deferralTimeoutMs": "Restart Deferral Timeout (ms)",
  "gateway.nodes.browser.mode": "Gateway Node Browser Mode",
  "gateway.nodes.browser.node": "Gateway Node Browser Pin",
  "gateway.nodes.allowCommands": "Gateway Node Allowlist (Extra Commands)",
--- a/src/infra/infra-runtime.test.ts
+++ b/src/infra/infra-runtime.test.ts
@@ -190,8 +190,8 @@ describe("infra runtime", () => {
        await vi.advanceTimersByTimeAsync(0);
        expect(emitSpy).not.toHaveBeenCalledWith("SIGUSR1");

-        // Advance past the 90s max deferral wait
-        await vi.advanceTimersByTimeAsync(90_000);
+        // Advance past the 5-minute max deferral wait
+        await vi.advanceTimersByTimeAsync(300_000);
        expect(emitSpy).toHaveBeenCalledWith("SIGUSR1");
      } finally {
        process.removeListener("SIGUSR1", handler);