fix(gateway): harden macOS update restart lifecycle

Summary:
- Clear stale SIGUSR1 restart state before rejected or externally allowed restart handling can leave an in-flight token stuck.
- Verify the live gateway version after macOS package-update service refreshes and skip redundant restarts when the refreshed LaunchAgent already serves the expected version.
- Set generated LaunchAgents to a 10s throttle plus 20s shutdown window and widen gateway bind retries around supervisor-owned restarts.

Fixes #79577. Refs #78699 and #60885.

Verification:
- pnpm test src/cli/gateway-cli/run-loop.test.ts src/infra/infra-runtime.test.ts
- pnpm test src/cli/update-cli.test.ts src/daemon/launchd.test.ts src/gateway/server/http-listen.test.ts
- pnpm exec oxfmt --check --threads=1 src/cli/gateway-cli/run-loop.ts src/cli/gateway-cli/run-loop.test.ts
- pnpm check:changed
- Crabbox/Blacksmith wrapper smoke passed focused tests plus pnpm check:changed: https://github.com/openclaw/openclaw/actions/runs/25595985603
- PR CI was green before upstream main advanced; the latest rebased heads hit unrelated broad lint failures also reproduced on current main CI (for example https://github.com/openclaw/openclaw/actions/runs/25598671666). No failing lint diagnostics referenced this gateway/update diff.
This commit is contained in:
Val Alexander
2026-05-09 05:21:17 -05:00
committed by GitHub
parent 4e8914861f
commit fa79e9754e
9 changed files with 125 additions and 23 deletions

View File

@@ -166,6 +166,7 @@ Docs: https://docs.openclaw.ai
- Control UI: refresh the model cache after `session_status(model=...)` changes a session model. Fixes #79613.
- Agents/context-engine: share loop-hook checkpoints with the after-turn finalizer so messages are not replayed. Fixes #79630.
- Codex app-server: keep native hook relays alive for long-running turns so shell and file approvals stay reachable until the configured run window finishes. (#77533) Thanks @rubencu.
- Gateway/macOS: clear ignored SIGUSR1 restart state, skip redundant package-update restarts when the refreshed LaunchAgent already serves the expected version, and give launchd a 10s throttle plus 20s shutdown window so update restarts do not leave old gateways alive or fight supervisor recovery. Fixes #79577; refs #78699 and #60885. Thanks @BunsDev.
- Gateway/agent: pass the session-key agent id into inline image attachment validation so the first image in a fresh per-agent session uses the agent's vision-capable model override instead of the text-only system default. Fixes #79407. Thanks @pandadev66.
- Gateway/maintenance: prune dedupe overflow against a stable excess count and keep active agent retries from starting duplicate runs after cache eviction. (#73841) Thanks @thesomewhatyou.
- Control UI/subagents: suppress internal `subagent_announce` handoff prompts from requester transcripts and hide legacy inter-session wrapper rows so completed subagent results no longer surface runtime context in WebChat history. (#79618) Thanks @joshavant.

View File

@@ -560,7 +560,7 @@ describe("runGatewayLoop", () => {
});
});
it("routes external SIGUSR1 through the restart scheduler before draining", async () => {
it("clears stale restart state before routing external SIGUSR1 through the scheduler", async () => {
vi.clearAllMocks();
consumeGatewaySigusr1RestartAuthorization.mockReturnValueOnce(false);
isGatewaySigusr1RestartExternallyAllowed.mockReturnValueOnce(true);
@@ -576,11 +576,37 @@ describe("runGatewayLoop", () => {
delayMs: 0,
reason: "SIGUSR1",
});
expect(markGatewaySigusr1RestartHandled).toHaveBeenCalledTimes(1);
expect(markGatewaySigusr1RestartHandled.mock.invocationCallOrder[0]).toBeLessThan(
scheduleGatewaySigusr1Restart.mock.invocationCallOrder[0] ?? 0,
);
expect(close).not.toHaveBeenCalled();
expect(start).toHaveBeenCalledTimes(1);
expect(markGatewaySigusr1RestartHandled).not.toHaveBeenCalled();
});
});
it("clears the in-flight restart token when an unauthorized SIGUSR1 is ignored", async () => {
vi.clearAllMocks();
consumeGatewaySigusr1RestartAuthorization.mockReturnValueOnce(false);
isGatewaySigusr1RestartExternallyAllowed.mockReturnValueOnce(false);
await withIsolatedSignals(async ({ captureSignal }) => {
const { close, start } = await createSignaledLoopHarness();
const sigusr1 = captureSignal("SIGUSR1");
sigusr1();
await new Promise<void>((resolve) => setImmediate(resolve));
expect(markGatewaySigusr1RestartHandled).toHaveBeenCalledTimes(1);
expect(scheduleGatewaySigusr1Restart).not.toHaveBeenCalled();
expect(close).not.toHaveBeenCalled();
expect(start).toHaveBeenCalledTimes(1);
expect(gatewayLog.warn).toHaveBeenCalledWith(
"SIGUSR1 restart ignored (not authorized; commands.restart=false or use gateway tool).",
);
});
});
it("releases the lock before exiting on spawned restart", async () => {
vi.clearAllMocks();
peekGatewaySigusr1RestartReason.mockReturnValue(undefined);

View File

@@ -487,6 +487,7 @@ export async function runGatewayLoop(params: {
}
const authorized = consumeGatewaySigusr1RestartAuthorization();
if (!authorized) {
markGatewaySigusr1RestartHandled();
if (!isGatewaySigusr1RestartExternallyAllowed()) {
gatewayLog.warn(
"SIGUSR1 restart ignored (not authorized; commands.restart=false or use gateway tool).",

View File

@@ -3049,6 +3049,52 @@ describe("update-cli", () => {
expect(doctorCommand).not.toHaveBeenCalled();
});
it("skips the post-refresh restart script when LaunchAgent already serves the expected package version", async () => {
const updatedRoot = createCaseDir("openclaw-updated-root");
const updatedEntrypoint = path.join(updatedRoot, "dist", "entry.js");
setupUpdatedRootRefresh({
entrypoints: [updatedEntrypoint],
gatewayUpdateImpl: async () =>
makeOkUpdateResult({
mode: "npm",
root: updatedRoot,
before: { version: "2026.4.23" },
after: { version: "2026.4.24" },
}),
});
serviceLoaded.mockResolvedValue(true);
probeGateway.mockResolvedValue({
ok: true,
close: null,
server: {
version: "2026.4.24",
connId: "updated-gateway",
},
auth: { role: "operator", scopes: ["operator.read"], capability: "read_only" },
health: null,
status: null,
presence: null,
configSnapshot: null,
connectLatencyMs: 1,
error: null,
url: "ws://127.0.0.1:18789",
});
await updateCommand({ yes: true });
expect(runCommandWithTimeout).toHaveBeenCalledWith(
[expect.stringMatching(/node/), updatedEntrypoint, "gateway", "install", "--force"],
expect.objectContaining({ cwd: updatedRoot, timeoutMs: 60_000 }),
);
expect(runCommandWithTimeout).not.toHaveBeenCalledWith(
[expect.stringMatching(/node/), updatedEntrypoint, "gateway", "restart"],
expect.anything(),
);
expect(runRestartScript).not.toHaveBeenCalled();
expect(probeGateway).toHaveBeenCalledWith(expect.objectContaining({ includeDetails: true }));
expect(defaultRuntime.exit).not.toHaveBeenCalledWith(1);
});
it("fails a package update when the restarted gateway reports activated plugin load errors", async () => {
const updatedRoot = createCaseDir("openclaw-updated-root");
const updatedEntrypoint = path.join(updatedRoot, "dist", "entry.js");

View File

@@ -112,6 +112,8 @@ import { suppressDeprecations } from "./suppress-deprecations.js";
const CLI_NAME = resolveCliName();
const SERVICE_REFRESH_TIMEOUT_MS = 60_000;
const POST_REFRESH_ALREADY_HEALTHY_ATTEMPTS = 10;
const POST_REFRESH_ALREADY_HEALTHY_DELAY_MS = 500;
const DEFAULT_UPDATE_STEP_TIMEOUT_MS = 30 * 60_000;
const POST_CORE_UPDATE_ENV = "OPENCLAW_UPDATE_POST_CORE";
const POST_CORE_UPDATE_CHANNEL_ENV = "OPENCLAW_UPDATE_POST_CORE_CHANNEL";
@@ -1516,6 +1518,7 @@ async function maybeRestartService(params: {
const isPackageUpdate = isPackageManagerUpdateMode(params.result.mode);
let restarted = false;
let restartInitiated = false;
let refreshedGatewayAlreadyHealthy = false;
if (params.refreshServiceEnv) {
try {
await refreshGatewayServiceEnv({
@@ -1538,25 +1541,51 @@ async function maybeRestartService(params: {
return false;
}
}
if (isPackageUpdate && expectedGatewayVersion) {
const health = await waitForGatewayHealthyRestart({
service: resolveGatewayService(),
port: params.gatewayPort,
expectedVersion: expectedGatewayVersion,
env: params.serviceEnv,
attempts: POST_REFRESH_ALREADY_HEALTHY_ATTEMPTS,
delayMs: POST_REFRESH_ALREADY_HEALTHY_DELAY_MS,
});
refreshedGatewayAlreadyHealthy = health.healthy;
if (refreshedGatewayAlreadyHealthy && !params.opts.json) {
defaultRuntime.log(
theme.muted(
"Gateway already reports the updated version after service refresh; skipped redundant restart.",
),
);
}
}
}
if (params.restartScriptPath) {
// Service refresh can bootstrap a RunAtLoad LaunchAgent directly. When
// that already produced the expected gateway version, a second kickstart
// would only race the healthy supervisor-owned process.
if (!refreshedGatewayAlreadyHealthy && params.restartScriptPath) {
await runRestartScript(params.restartScriptPath);
restartInitiated = true;
} else if (params.refreshServiceEnv && isPackageUpdate) {
} else if (!refreshedGatewayAlreadyHealthy && params.refreshServiceEnv && isPackageUpdate) {
restarted = await runUpdatedInstallGatewayRestart({
result: params.result,
jsonMode: Boolean(params.opts.json),
invocationCwd: params.invocationCwd,
env: params.serviceEnv,
});
} else if (shouldUseLegacyProcessRestartAfterUpdate({ updateMode: params.result.mode })) {
} else if (
!refreshedGatewayAlreadyHealthy &&
shouldUseLegacyProcessRestartAfterUpdate({ updateMode: params.result.mode })
) {
restarted = await runDaemonRestart();
} else if (!params.opts.json) {
} else if (!refreshedGatewayAlreadyHealthy && !params.opts.json) {
defaultRuntime.log(theme.muted("No installed gateway service found; skipped restart."));
}
const shouldVerifyRestart =
restartInitiated || (restarted && expectedGatewayVersion !== undefined);
refreshedGatewayAlreadyHealthy ||
restartInitiated ||
(restarted && expectedGatewayVersion !== undefined);
if (shouldVerifyRestart) {
const restartHealthy = await verifyRestartedGateway(expectedGatewayVersion);
if (!restartHealthy) {

View File

@@ -1,10 +1,11 @@
import fs from "node:fs/promises";
import type { GatewayServiceEnvironmentValueSource } from "./service-types.js";
// launchd applies ThrottleInterval to any rapid relaunch, including
// intentional gateway restarts. Keep it low so CLI restarts and forced
// reinstalls do not stall for a full minute.
export const LAUNCH_AGENT_THROTTLE_INTERVAL_SECONDS = 1;
// launchd defaults to a 10s spawn throttle. Keep that default explicitly so
// crash loops back off instead of respawning every second while still allowing
// explicit kickstart restarts to take effect.
export const LAUNCH_AGENT_THROTTLE_INTERVAL_SECONDS = 10;
export const LAUNCH_AGENT_EXIT_TIMEOUT_SECONDS = 20;
// launchd stores plist integer values in decimal; 0o077 renders as 63 (owner-only files).
export const LAUNCH_AGENT_UMASK_DECIMAL = 0o077;
@@ -178,5 +179,5 @@ export function buildLaunchAgentPlist({
? `\n <key>Comment</key>\n <string>${plistEscape(comment.trim())}</string>`
: "";
const envXml = renderEnvDict(environment);
return `<?xml version="1.0" encoding="UTF-8"?>\n<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">\n<plist version="1.0">\n <dict>\n <key>Label</key>\n <string>${plistEscape(label)}</string>\n ${commentXml}\n <key>RunAtLoad</key>\n <true/>\n <key>KeepAlive</key>\n <true/>\n <key>ThrottleInterval</key>\n <integer>${LAUNCH_AGENT_THROTTLE_INTERVAL_SECONDS}</integer>\n <key>Umask</key>\n <integer>${LAUNCH_AGENT_UMASK_DECIMAL}</integer>\n <key>ProgramArguments</key>\n <array>${argsXml}\n </array>\n ${workingDirXml}\n <key>StandardOutPath</key>\n <string>${plistEscape(stdoutPath)}</string>\n <key>StandardErrorPath</key>\n <string>${plistEscape(stderrPath)}</string>${envXml}\n </dict>\n</plist>\n`;
return `<?xml version="1.0" encoding="UTF-8"?>\n<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">\n<plist version="1.0">\n <dict>\n <key>Label</key>\n <string>${plistEscape(label)}</string>\n ${commentXml}\n <key>RunAtLoad</key>\n <true/>\n <key>KeepAlive</key>\n <true/>\n <key>ExitTimeOut</key>\n <integer>${LAUNCH_AGENT_EXIT_TIMEOUT_SECONDS}</integer>\n <key>ThrottleInterval</key>\n <integer>${LAUNCH_AGENT_THROTTLE_INTERVAL_SECONDS}</integer>\n <key>Umask</key>\n <integer>${LAUNCH_AGENT_UMASK_DECIMAL}</integer>\n <key>ProgramArguments</key>\n <array>${argsXml}\n </array>\n ${workingDirXml}\n <key>StandardOutPath</key>\n <string>${plistEscape(stdoutPath)}</string>\n <key>StandardErrorPath</key>\n <string>${plistEscape(stderrPath)}</string>${envXml}\n </dict>\n</plist>\n`;
}

View File

@@ -1,6 +1,7 @@
import { PassThrough } from "node:stream";
import { beforeEach, describe, expect, it, vi } from "vitest";
import {
LAUNCH_AGENT_EXIT_TIMEOUT_SECONDS,
LAUNCH_AGENT_THROTTLE_INTERVAL_SECONDS,
LAUNCH_AGENT_UMASK_DECIMAL,
} from "./launchd-plist.js";
@@ -562,7 +563,7 @@ describe("launchd install", () => {
expect(state.dirModes.get(tmpDir)).toBe(0o700);
});
it("writes KeepAlive=true policy with restrictive umask", async () => {
it("writes KeepAlive=true policy with shutdown and throttle limits", async () => {
const env = createDefaultLaunchdEnv();
await installLaunchAgent({
env,
@@ -575,6 +576,8 @@ describe("launchd install", () => {
expect(plist).toContain("<key>KeepAlive</key>");
expect(plist).toContain("<true/>");
expect(plist).not.toContain("<key>SuccessfulExit</key>");
expect(plist).toContain("<key>ExitTimeOut</key>");
expect(plist).toContain(`<integer>${LAUNCH_AGENT_EXIT_TIMEOUT_SECONDS}</integer>`);
expect(plist).toContain("<key>Umask</key>");
expect(plist).toContain(`<integer>${LAUNCH_AGENT_UMASK_DECIMAL}</integer>`);
expect(plist).toContain("<key>ThrottleInterval</key>");

View File

@@ -63,14 +63,9 @@ describe("listenGatewayHttpServer", () => {
it("throws GatewayLockError after EADDRINUSE retries are exhausted", async () => {
sleepMock.mockClear();
const fake = createFakeHttpServer([
{ kind: "error", code: "EADDRINUSE" },
{ kind: "error", code: "EADDRINUSE" },
{ kind: "error", code: "EADDRINUSE" },
{ kind: "error", code: "EADDRINUSE" },
{ kind: "error", code: "EADDRINUSE" },
{ kind: "error", code: "EADDRINUSE" },
]);
const fake = createFakeHttpServer(
Array.from({ length: 22 }, () => ({ kind: "error" as const, code: "EADDRINUSE" })),
);
await expect(
listenGatewayHttpServer({
@@ -80,7 +75,7 @@ describe("listenGatewayHttpServer", () => {
}),
).rejects.toBeInstanceOf(GatewayLockError);
expect(fake.closeCalls).toBe(4);
expect(fake.closeCalls).toBe(20);
});
it("wraps non-EADDRINUSE errors as GatewayLockError", async () => {

View File

@@ -2,7 +2,7 @@ import type { Server as HttpServer } from "node:http";
import { GatewayLockError } from "../../infra/gateway-lock.js";
import { sleep } from "../../utils.js";
const EADDRINUSE_MAX_RETRIES = 4;
const EADDRINUSE_MAX_RETRIES = 20;
const EADDRINUSE_RETRY_INTERVAL_MS = 500;
async function closeServerQuietly(httpServer: HttpServer): Promise<void> {