mirror of
https://github.com/openclaw/openclaw.git
synced 2026-05-16 04:00:45 +00:00
fix(gateway): harden macOS update restart lifecycle
Summary: - Clear stale SIGUSR1 restart state before rejected or externally allowed restart handling can leave an in-flight token stuck. - Verify the live gateway version after macOS package-update service refreshes and skip redundant restarts when the refreshed LaunchAgent already serves the expected version. - Set generated LaunchAgents to a 10s throttle plus 20s shutdown window and widen gateway bind retries around supervisor-owned restarts. Fixes #79577. Refs #78699 and #60885. Verification: - pnpm test src/cli/gateway-cli/run-loop.test.ts src/infra/infra-runtime.test.ts - pnpm test src/cli/update-cli.test.ts src/daemon/launchd.test.ts src/gateway/server/http-listen.test.ts - pnpm exec oxfmt --check --threads=1 src/cli/gateway-cli/run-loop.ts src/cli/gateway-cli/run-loop.test.ts - pnpm check:changed - Crabbox/Blacksmith wrapper smoke passed focused tests plus pnpm check:changed: https://github.com/openclaw/openclaw/actions/runs/25595985603 - PR CI was green before upstream main advanced; the latest rebased heads hit unrelated broad lint failures also reproduced on current main CI (for example https://github.com/openclaw/openclaw/actions/runs/25598671666). No failing lint diagnostics referenced this gateway/update diff.
This commit is contained in:
@@ -166,6 +166,7 @@ Docs: https://docs.openclaw.ai
|
||||
- Control UI: refresh the model cache after `session_status(model=...)` changes a session model. Fixes #79613.
|
||||
- Agents/context-engine: share loop-hook checkpoints with the after-turn finalizer so messages are not replayed. Fixes #79630.
|
||||
- Codex app-server: keep native hook relays alive for long-running turns so shell and file approvals stay reachable until the configured run window finishes. (#77533) Thanks @rubencu.
|
||||
- Gateway/macOS: clear ignored SIGUSR1 restart state, skip redundant package-update restarts when the refreshed LaunchAgent already serves the expected version, and give launchd a 10s throttle plus 20s shutdown window so update restarts do not leave old gateways alive or fight supervisor recovery. Fixes #79577; refs #78699 and #60885. Thanks @BunsDev.
|
||||
- Gateway/agent: pass the session-key agent id into inline image attachment validation so the first image in a fresh per-agent session uses the agent's vision-capable model override instead of the text-only system default. Fixes #79407. Thanks @pandadev66.
|
||||
- Gateway/maintenance: prune dedupe overflow against a stable excess count and keep active agent retries from starting duplicate runs after cache eviction. (#73841) Thanks @thesomewhatyou.
|
||||
- Control UI/subagents: suppress internal `subagent_announce` handoff prompts from requester transcripts and hide legacy inter-session wrapper rows so completed subagent results no longer surface runtime context in WebChat history. (#79618) Thanks @joshavant.
|
||||
|
||||
@@ -560,7 +560,7 @@ describe("runGatewayLoop", () => {
|
||||
});
|
||||
});
|
||||
|
||||
it("routes external SIGUSR1 through the restart scheduler before draining", async () => {
|
||||
it("clears stale restart state before routing external SIGUSR1 through the scheduler", async () => {
|
||||
vi.clearAllMocks();
|
||||
consumeGatewaySigusr1RestartAuthorization.mockReturnValueOnce(false);
|
||||
isGatewaySigusr1RestartExternallyAllowed.mockReturnValueOnce(true);
|
||||
@@ -576,11 +576,37 @@ describe("runGatewayLoop", () => {
|
||||
delayMs: 0,
|
||||
reason: "SIGUSR1",
|
||||
});
|
||||
expect(markGatewaySigusr1RestartHandled).toHaveBeenCalledTimes(1);
|
||||
expect(markGatewaySigusr1RestartHandled.mock.invocationCallOrder[0]).toBeLessThan(
|
||||
scheduleGatewaySigusr1Restart.mock.invocationCallOrder[0] ?? 0,
|
||||
);
|
||||
expect(close).not.toHaveBeenCalled();
|
||||
expect(start).toHaveBeenCalledTimes(1);
|
||||
expect(markGatewaySigusr1RestartHandled).not.toHaveBeenCalled();
|
||||
});
|
||||
});
|
||||
|
||||
it("clears the in-flight restart token when an unauthorized SIGUSR1 is ignored", async () => {
|
||||
vi.clearAllMocks();
|
||||
consumeGatewaySigusr1RestartAuthorization.mockReturnValueOnce(false);
|
||||
isGatewaySigusr1RestartExternallyAllowed.mockReturnValueOnce(false);
|
||||
|
||||
await withIsolatedSignals(async ({ captureSignal }) => {
|
||||
const { close, start } = await createSignaledLoopHarness();
|
||||
const sigusr1 = captureSignal("SIGUSR1");
|
||||
|
||||
sigusr1();
|
||||
await new Promise<void>((resolve) => setImmediate(resolve));
|
||||
|
||||
expect(markGatewaySigusr1RestartHandled).toHaveBeenCalledTimes(1);
|
||||
expect(scheduleGatewaySigusr1Restart).not.toHaveBeenCalled();
|
||||
expect(close).not.toHaveBeenCalled();
|
||||
expect(start).toHaveBeenCalledTimes(1);
|
||||
expect(gatewayLog.warn).toHaveBeenCalledWith(
|
||||
"SIGUSR1 restart ignored (not authorized; commands.restart=false or use gateway tool).",
|
||||
);
|
||||
});
|
||||
});
|
||||
|
||||
it("releases the lock before exiting on spawned restart", async () => {
|
||||
vi.clearAllMocks();
|
||||
peekGatewaySigusr1RestartReason.mockReturnValue(undefined);
|
||||
|
||||
@@ -487,6 +487,7 @@ export async function runGatewayLoop(params: {
|
||||
}
|
||||
const authorized = consumeGatewaySigusr1RestartAuthorization();
|
||||
if (!authorized) {
|
||||
markGatewaySigusr1RestartHandled();
|
||||
if (!isGatewaySigusr1RestartExternallyAllowed()) {
|
||||
gatewayLog.warn(
|
||||
"SIGUSR1 restart ignored (not authorized; commands.restart=false or use gateway tool).",
|
||||
|
||||
@@ -3049,6 +3049,52 @@ describe("update-cli", () => {
|
||||
expect(doctorCommand).not.toHaveBeenCalled();
|
||||
});
|
||||
|
||||
it("skips the post-refresh restart script when LaunchAgent already serves the expected package version", async () => {
|
||||
const updatedRoot = createCaseDir("openclaw-updated-root");
|
||||
const updatedEntrypoint = path.join(updatedRoot, "dist", "entry.js");
|
||||
setupUpdatedRootRefresh({
|
||||
entrypoints: [updatedEntrypoint],
|
||||
gatewayUpdateImpl: async () =>
|
||||
makeOkUpdateResult({
|
||||
mode: "npm",
|
||||
root: updatedRoot,
|
||||
before: { version: "2026.4.23" },
|
||||
after: { version: "2026.4.24" },
|
||||
}),
|
||||
});
|
||||
serviceLoaded.mockResolvedValue(true);
|
||||
probeGateway.mockResolvedValue({
|
||||
ok: true,
|
||||
close: null,
|
||||
server: {
|
||||
version: "2026.4.24",
|
||||
connId: "updated-gateway",
|
||||
},
|
||||
auth: { role: "operator", scopes: ["operator.read"], capability: "read_only" },
|
||||
health: null,
|
||||
status: null,
|
||||
presence: null,
|
||||
configSnapshot: null,
|
||||
connectLatencyMs: 1,
|
||||
error: null,
|
||||
url: "ws://127.0.0.1:18789",
|
||||
});
|
||||
|
||||
await updateCommand({ yes: true });
|
||||
|
||||
expect(runCommandWithTimeout).toHaveBeenCalledWith(
|
||||
[expect.stringMatching(/node/), updatedEntrypoint, "gateway", "install", "--force"],
|
||||
expect.objectContaining({ cwd: updatedRoot, timeoutMs: 60_000 }),
|
||||
);
|
||||
expect(runCommandWithTimeout).not.toHaveBeenCalledWith(
|
||||
[expect.stringMatching(/node/), updatedEntrypoint, "gateway", "restart"],
|
||||
expect.anything(),
|
||||
);
|
||||
expect(runRestartScript).not.toHaveBeenCalled();
|
||||
expect(probeGateway).toHaveBeenCalledWith(expect.objectContaining({ includeDetails: true }));
|
||||
expect(defaultRuntime.exit).not.toHaveBeenCalledWith(1);
|
||||
});
|
||||
|
||||
it("fails a package update when the restarted gateway reports activated plugin load errors", async () => {
|
||||
const updatedRoot = createCaseDir("openclaw-updated-root");
|
||||
const updatedEntrypoint = path.join(updatedRoot, "dist", "entry.js");
|
||||
|
||||
@@ -112,6 +112,8 @@ import { suppressDeprecations } from "./suppress-deprecations.js";
|
||||
|
||||
const CLI_NAME = resolveCliName();
|
||||
const SERVICE_REFRESH_TIMEOUT_MS = 60_000;
|
||||
const POST_REFRESH_ALREADY_HEALTHY_ATTEMPTS = 10;
|
||||
const POST_REFRESH_ALREADY_HEALTHY_DELAY_MS = 500;
|
||||
const DEFAULT_UPDATE_STEP_TIMEOUT_MS = 30 * 60_000;
|
||||
const POST_CORE_UPDATE_ENV = "OPENCLAW_UPDATE_POST_CORE";
|
||||
const POST_CORE_UPDATE_CHANNEL_ENV = "OPENCLAW_UPDATE_POST_CORE_CHANNEL";
|
||||
@@ -1516,6 +1518,7 @@ async function maybeRestartService(params: {
|
||||
const isPackageUpdate = isPackageManagerUpdateMode(params.result.mode);
|
||||
let restarted = false;
|
||||
let restartInitiated = false;
|
||||
let refreshedGatewayAlreadyHealthy = false;
|
||||
if (params.refreshServiceEnv) {
|
||||
try {
|
||||
await refreshGatewayServiceEnv({
|
||||
@@ -1538,25 +1541,51 @@ async function maybeRestartService(params: {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
if (isPackageUpdate && expectedGatewayVersion) {
|
||||
const health = await waitForGatewayHealthyRestart({
|
||||
service: resolveGatewayService(),
|
||||
port: params.gatewayPort,
|
||||
expectedVersion: expectedGatewayVersion,
|
||||
env: params.serviceEnv,
|
||||
attempts: POST_REFRESH_ALREADY_HEALTHY_ATTEMPTS,
|
||||
delayMs: POST_REFRESH_ALREADY_HEALTHY_DELAY_MS,
|
||||
});
|
||||
refreshedGatewayAlreadyHealthy = health.healthy;
|
||||
if (refreshedGatewayAlreadyHealthy && !params.opts.json) {
|
||||
defaultRuntime.log(
|
||||
theme.muted(
|
||||
"Gateway already reports the updated version after service refresh; skipped redundant restart.",
|
||||
),
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
if (params.restartScriptPath) {
|
||||
// Service refresh can bootstrap a RunAtLoad LaunchAgent directly. When
|
||||
// that already produced the expected gateway version, a second kickstart
|
||||
// would only race the healthy supervisor-owned process.
|
||||
if (!refreshedGatewayAlreadyHealthy && params.restartScriptPath) {
|
||||
await runRestartScript(params.restartScriptPath);
|
||||
restartInitiated = true;
|
||||
} else if (params.refreshServiceEnv && isPackageUpdate) {
|
||||
} else if (!refreshedGatewayAlreadyHealthy && params.refreshServiceEnv && isPackageUpdate) {
|
||||
restarted = await runUpdatedInstallGatewayRestart({
|
||||
result: params.result,
|
||||
jsonMode: Boolean(params.opts.json),
|
||||
invocationCwd: params.invocationCwd,
|
||||
env: params.serviceEnv,
|
||||
});
|
||||
} else if (shouldUseLegacyProcessRestartAfterUpdate({ updateMode: params.result.mode })) {
|
||||
} else if (
|
||||
!refreshedGatewayAlreadyHealthy &&
|
||||
shouldUseLegacyProcessRestartAfterUpdate({ updateMode: params.result.mode })
|
||||
) {
|
||||
restarted = await runDaemonRestart();
|
||||
} else if (!params.opts.json) {
|
||||
} else if (!refreshedGatewayAlreadyHealthy && !params.opts.json) {
|
||||
defaultRuntime.log(theme.muted("No installed gateway service found; skipped restart."));
|
||||
}
|
||||
|
||||
const shouldVerifyRestart =
|
||||
restartInitiated || (restarted && expectedGatewayVersion !== undefined);
|
||||
refreshedGatewayAlreadyHealthy ||
|
||||
restartInitiated ||
|
||||
(restarted && expectedGatewayVersion !== undefined);
|
||||
if (shouldVerifyRestart) {
|
||||
const restartHealthy = await verifyRestartedGateway(expectedGatewayVersion);
|
||||
if (!restartHealthy) {
|
||||
|
||||
@@ -1,10 +1,11 @@
|
||||
import fs from "node:fs/promises";
|
||||
import type { GatewayServiceEnvironmentValueSource } from "./service-types.js";
|
||||
|
||||
// launchd applies ThrottleInterval to any rapid relaunch, including
|
||||
// intentional gateway restarts. Keep it low so CLI restarts and forced
|
||||
// reinstalls do not stall for a full minute.
|
||||
export const LAUNCH_AGENT_THROTTLE_INTERVAL_SECONDS = 1;
|
||||
// launchd defaults to a 10s spawn throttle. Keep that default explicitly so
|
||||
// crash loops back off instead of respawning every second while still allowing
|
||||
// explicit kickstart restarts to take effect.
|
||||
export const LAUNCH_AGENT_THROTTLE_INTERVAL_SECONDS = 10;
|
||||
export const LAUNCH_AGENT_EXIT_TIMEOUT_SECONDS = 20;
|
||||
// launchd stores plist integer values in decimal; 0o077 renders as 63 (owner-only files).
|
||||
export const LAUNCH_AGENT_UMASK_DECIMAL = 0o077;
|
||||
|
||||
@@ -178,5 +179,5 @@ export function buildLaunchAgentPlist({
|
||||
? `\n <key>Comment</key>\n <string>${plistEscape(comment.trim())}</string>`
|
||||
: "";
|
||||
const envXml = renderEnvDict(environment);
|
||||
return `<?xml version="1.0" encoding="UTF-8"?>\n<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">\n<plist version="1.0">\n <dict>\n <key>Label</key>\n <string>${plistEscape(label)}</string>\n ${commentXml}\n <key>RunAtLoad</key>\n <true/>\n <key>KeepAlive</key>\n <true/>\n <key>ThrottleInterval</key>\n <integer>${LAUNCH_AGENT_THROTTLE_INTERVAL_SECONDS}</integer>\n <key>Umask</key>\n <integer>${LAUNCH_AGENT_UMASK_DECIMAL}</integer>\n <key>ProgramArguments</key>\n <array>${argsXml}\n </array>\n ${workingDirXml}\n <key>StandardOutPath</key>\n <string>${plistEscape(stdoutPath)}</string>\n <key>StandardErrorPath</key>\n <string>${plistEscape(stderrPath)}</string>${envXml}\n </dict>\n</plist>\n`;
|
||||
return `<?xml version="1.0" encoding="UTF-8"?>\n<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">\n<plist version="1.0">\n <dict>\n <key>Label</key>\n <string>${plistEscape(label)}</string>\n ${commentXml}\n <key>RunAtLoad</key>\n <true/>\n <key>KeepAlive</key>\n <true/>\n <key>ExitTimeOut</key>\n <integer>${LAUNCH_AGENT_EXIT_TIMEOUT_SECONDS}</integer>\n <key>ThrottleInterval</key>\n <integer>${LAUNCH_AGENT_THROTTLE_INTERVAL_SECONDS}</integer>\n <key>Umask</key>\n <integer>${LAUNCH_AGENT_UMASK_DECIMAL}</integer>\n <key>ProgramArguments</key>\n <array>${argsXml}\n </array>\n ${workingDirXml}\n <key>StandardOutPath</key>\n <string>${plistEscape(stdoutPath)}</string>\n <key>StandardErrorPath</key>\n <string>${plistEscape(stderrPath)}</string>${envXml}\n </dict>\n</plist>\n`;
|
||||
}
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
import { PassThrough } from "node:stream";
|
||||
import { beforeEach, describe, expect, it, vi } from "vitest";
|
||||
import {
|
||||
LAUNCH_AGENT_EXIT_TIMEOUT_SECONDS,
|
||||
LAUNCH_AGENT_THROTTLE_INTERVAL_SECONDS,
|
||||
LAUNCH_AGENT_UMASK_DECIMAL,
|
||||
} from "./launchd-plist.js";
|
||||
@@ -562,7 +563,7 @@ describe("launchd install", () => {
|
||||
expect(state.dirModes.get(tmpDir)).toBe(0o700);
|
||||
});
|
||||
|
||||
it("writes KeepAlive=true policy with restrictive umask", async () => {
|
||||
it("writes KeepAlive=true policy with shutdown and throttle limits", async () => {
|
||||
const env = createDefaultLaunchdEnv();
|
||||
await installLaunchAgent({
|
||||
env,
|
||||
@@ -575,6 +576,8 @@ describe("launchd install", () => {
|
||||
expect(plist).toContain("<key>KeepAlive</key>");
|
||||
expect(plist).toContain("<true/>");
|
||||
expect(plist).not.toContain("<key>SuccessfulExit</key>");
|
||||
expect(plist).toContain("<key>ExitTimeOut</key>");
|
||||
expect(plist).toContain(`<integer>${LAUNCH_AGENT_EXIT_TIMEOUT_SECONDS}</integer>`);
|
||||
expect(plist).toContain("<key>Umask</key>");
|
||||
expect(plist).toContain(`<integer>${LAUNCH_AGENT_UMASK_DECIMAL}</integer>`);
|
||||
expect(plist).toContain("<key>ThrottleInterval</key>");
|
||||
|
||||
@@ -63,14 +63,9 @@ describe("listenGatewayHttpServer", () => {
|
||||
|
||||
it("throws GatewayLockError after EADDRINUSE retries are exhausted", async () => {
|
||||
sleepMock.mockClear();
|
||||
const fake = createFakeHttpServer([
|
||||
{ kind: "error", code: "EADDRINUSE" },
|
||||
{ kind: "error", code: "EADDRINUSE" },
|
||||
{ kind: "error", code: "EADDRINUSE" },
|
||||
{ kind: "error", code: "EADDRINUSE" },
|
||||
{ kind: "error", code: "EADDRINUSE" },
|
||||
{ kind: "error", code: "EADDRINUSE" },
|
||||
]);
|
||||
const fake = createFakeHttpServer(
|
||||
Array.from({ length: 22 }, () => ({ kind: "error" as const, code: "EADDRINUSE" })),
|
||||
);
|
||||
|
||||
await expect(
|
||||
listenGatewayHttpServer({
|
||||
@@ -80,7 +75,7 @@ describe("listenGatewayHttpServer", () => {
|
||||
}),
|
||||
).rejects.toBeInstanceOf(GatewayLockError);
|
||||
|
||||
expect(fake.closeCalls).toBe(4);
|
||||
expect(fake.closeCalls).toBe(20);
|
||||
});
|
||||
|
||||
it("wraps non-EADDRINUSE errors as GatewayLockError", async () => {
|
||||
|
||||
@@ -2,7 +2,7 @@ import type { Server as HttpServer } from "node:http";
|
||||
import { GatewayLockError } from "../../infra/gateway-lock.js";
|
||||
import { sleep } from "../../utils.js";
|
||||
|
||||
const EADDRINUSE_MAX_RETRIES = 4;
|
||||
const EADDRINUSE_MAX_RETRIES = 20;
|
||||
const EADDRINUSE_RETRY_INTERVAL_MS = 500;
|
||||
|
||||
async function closeServerQuietly(httpServer: HttpServer): Promise<void> {
|
||||
|
||||
Reference in New Issue
Block a user