diff --git a/CHANGELOG.md b/CHANGELOG.md index 0ca0374bcc7..5d0a90e0d91 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -166,6 +166,7 @@ Docs: https://docs.openclaw.ai - Control UI: refresh the model cache after `session_status(model=...)` changes a session model. Fixes #79613. - Agents/context-engine: share loop-hook checkpoints with the after-turn finalizer so messages are not replayed. Fixes #79630. - Codex app-server: keep native hook relays alive for long-running turns so shell and file approvals stay reachable until the configured run window finishes. (#77533) Thanks @rubencu. +- Gateway/macOS: clear ignored SIGUSR1 restart state, skip redundant package-update restarts when the refreshed LaunchAgent already serves the expected version, and give launchd a 10s throttle plus 20s shutdown window so update restarts do not leave old gateways alive or fight supervisor recovery. Fixes #79577; refs #78699 and #60885. Thanks @BunsDev. - Gateway/agent: pass the session-key agent id into inline image attachment validation so the first image in a fresh per-agent session uses the agent's vision-capable model override instead of the text-only system default. Fixes #79407. Thanks @pandadev66. - Gateway/maintenance: prune dedupe overflow against a stable excess count and keep active agent retries from starting duplicate runs after cache eviction. (#73841) Thanks @thesomewhatyou. - Control UI/subagents: suppress internal `subagent_announce` handoff prompts from requester transcripts and hide legacy inter-session wrapper rows so completed subagent results no longer surface runtime context in WebChat history. (#79618) Thanks @joshavant. diff --git a/src/cli/gateway-cli/run-loop.test.ts b/src/cli/gateway-cli/run-loop.test.ts index de20676a449..76d11b3e8d9 100644 --- a/src/cli/gateway-cli/run-loop.test.ts +++ b/src/cli/gateway-cli/run-loop.test.ts @@ -560,7 +560,7 @@ describe("runGatewayLoop", () => { }); }); - it("routes external SIGUSR1 through the restart scheduler before draining", async () => { + it("clears stale restart state before routing external SIGUSR1 through the scheduler", async () => { vi.clearAllMocks(); consumeGatewaySigusr1RestartAuthorization.mockReturnValueOnce(false); isGatewaySigusr1RestartExternallyAllowed.mockReturnValueOnce(true); @@ -576,11 +576,37 @@ describe("runGatewayLoop", () => { delayMs: 0, reason: "SIGUSR1", }); + expect(markGatewaySigusr1RestartHandled).toHaveBeenCalledTimes(1); + expect(markGatewaySigusr1RestartHandled.mock.invocationCallOrder[0]).toBeLessThan( + scheduleGatewaySigusr1Restart.mock.invocationCallOrder[0] ?? 0, + ); expect(close).not.toHaveBeenCalled(); expect(start).toHaveBeenCalledTimes(1); - expect(markGatewaySigusr1RestartHandled).not.toHaveBeenCalled(); }); }); + + it("clears the in-flight restart token when an unauthorized SIGUSR1 is ignored", async () => { + vi.clearAllMocks(); + consumeGatewaySigusr1RestartAuthorization.mockReturnValueOnce(false); + isGatewaySigusr1RestartExternallyAllowed.mockReturnValueOnce(false); + + await withIsolatedSignals(async ({ captureSignal }) => { + const { close, start } = await createSignaledLoopHarness(); + const sigusr1 = captureSignal("SIGUSR1"); + + sigusr1(); + await new Promise((resolve) => setImmediate(resolve)); + + expect(markGatewaySigusr1RestartHandled).toHaveBeenCalledTimes(1); + expect(scheduleGatewaySigusr1Restart).not.toHaveBeenCalled(); + expect(close).not.toHaveBeenCalled(); + expect(start).toHaveBeenCalledTimes(1); + expect(gatewayLog.warn).toHaveBeenCalledWith( + "SIGUSR1 restart ignored (not authorized; commands.restart=false or use gateway tool).", + ); + }); + }); + it("releases the lock before exiting on spawned restart", async () => { vi.clearAllMocks(); peekGatewaySigusr1RestartReason.mockReturnValue(undefined); diff --git a/src/cli/gateway-cli/run-loop.ts b/src/cli/gateway-cli/run-loop.ts index b112fd9aef7..c70bbc5cc71 100644 --- a/src/cli/gateway-cli/run-loop.ts +++ b/src/cli/gateway-cli/run-loop.ts @@ -487,6 +487,7 @@ export async function runGatewayLoop(params: { } const authorized = consumeGatewaySigusr1RestartAuthorization(); if (!authorized) { + markGatewaySigusr1RestartHandled(); if (!isGatewaySigusr1RestartExternallyAllowed()) { gatewayLog.warn( "SIGUSR1 restart ignored (not authorized; commands.restart=false or use gateway tool).", diff --git a/src/cli/update-cli.test.ts b/src/cli/update-cli.test.ts index 4a3a33b3870..ef2d74ffd8d 100644 --- a/src/cli/update-cli.test.ts +++ b/src/cli/update-cli.test.ts @@ -3049,6 +3049,52 @@ describe("update-cli", () => { expect(doctorCommand).not.toHaveBeenCalled(); }); + it("skips the post-refresh restart script when LaunchAgent already serves the expected package version", async () => { + const updatedRoot = createCaseDir("openclaw-updated-root"); + const updatedEntrypoint = path.join(updatedRoot, "dist", "entry.js"); + setupUpdatedRootRefresh({ + entrypoints: [updatedEntrypoint], + gatewayUpdateImpl: async () => + makeOkUpdateResult({ + mode: "npm", + root: updatedRoot, + before: { version: "2026.4.23" }, + after: { version: "2026.4.24" }, + }), + }); + serviceLoaded.mockResolvedValue(true); + probeGateway.mockResolvedValue({ + ok: true, + close: null, + server: { + version: "2026.4.24", + connId: "updated-gateway", + }, + auth: { role: "operator", scopes: ["operator.read"], capability: "read_only" }, + health: null, + status: null, + presence: null, + configSnapshot: null, + connectLatencyMs: 1, + error: null, + url: "ws://127.0.0.1:18789", + }); + + await updateCommand({ yes: true }); + + expect(runCommandWithTimeout).toHaveBeenCalledWith( + [expect.stringMatching(/node/), updatedEntrypoint, "gateway", "install", "--force"], + expect.objectContaining({ cwd: updatedRoot, timeoutMs: 60_000 }), + ); + expect(runCommandWithTimeout).not.toHaveBeenCalledWith( + [expect.stringMatching(/node/), updatedEntrypoint, "gateway", "restart"], + expect.anything(), + ); + expect(runRestartScript).not.toHaveBeenCalled(); + expect(probeGateway).toHaveBeenCalledWith(expect.objectContaining({ includeDetails: true })); + expect(defaultRuntime.exit).not.toHaveBeenCalledWith(1); + }); + it("fails a package update when the restarted gateway reports activated plugin load errors", async () => { const updatedRoot = createCaseDir("openclaw-updated-root"); const updatedEntrypoint = path.join(updatedRoot, "dist", "entry.js"); diff --git a/src/cli/update-cli/update-command.ts b/src/cli/update-cli/update-command.ts index 3bda5e9f71c..6994edfca81 100644 --- a/src/cli/update-cli/update-command.ts +++ b/src/cli/update-cli/update-command.ts @@ -112,6 +112,8 @@ import { suppressDeprecations } from "./suppress-deprecations.js"; const CLI_NAME = resolveCliName(); const SERVICE_REFRESH_TIMEOUT_MS = 60_000; +const POST_REFRESH_ALREADY_HEALTHY_ATTEMPTS = 10; +const POST_REFRESH_ALREADY_HEALTHY_DELAY_MS = 500; const DEFAULT_UPDATE_STEP_TIMEOUT_MS = 30 * 60_000; const POST_CORE_UPDATE_ENV = "OPENCLAW_UPDATE_POST_CORE"; const POST_CORE_UPDATE_CHANNEL_ENV = "OPENCLAW_UPDATE_POST_CORE_CHANNEL"; @@ -1516,6 +1518,7 @@ async function maybeRestartService(params: { const isPackageUpdate = isPackageManagerUpdateMode(params.result.mode); let restarted = false; let restartInitiated = false; + let refreshedGatewayAlreadyHealthy = false; if (params.refreshServiceEnv) { try { await refreshGatewayServiceEnv({ @@ -1538,25 +1541,51 @@ async function maybeRestartService(params: { return false; } } + if (isPackageUpdate && expectedGatewayVersion) { + const health = await waitForGatewayHealthyRestart({ + service: resolveGatewayService(), + port: params.gatewayPort, + expectedVersion: expectedGatewayVersion, + env: params.serviceEnv, + attempts: POST_REFRESH_ALREADY_HEALTHY_ATTEMPTS, + delayMs: POST_REFRESH_ALREADY_HEALTHY_DELAY_MS, + }); + refreshedGatewayAlreadyHealthy = health.healthy; + if (refreshedGatewayAlreadyHealthy && !params.opts.json) { + defaultRuntime.log( + theme.muted( + "Gateway already reports the updated version after service refresh; skipped redundant restart.", + ), + ); + } + } } - if (params.restartScriptPath) { + // Service refresh can bootstrap a RunAtLoad LaunchAgent directly. When + // that already produced the expected gateway version, a second kickstart + // would only race the healthy supervisor-owned process. + if (!refreshedGatewayAlreadyHealthy && params.restartScriptPath) { await runRestartScript(params.restartScriptPath); restartInitiated = true; - } else if (params.refreshServiceEnv && isPackageUpdate) { + } else if (!refreshedGatewayAlreadyHealthy && params.refreshServiceEnv && isPackageUpdate) { restarted = await runUpdatedInstallGatewayRestart({ result: params.result, jsonMode: Boolean(params.opts.json), invocationCwd: params.invocationCwd, env: params.serviceEnv, }); - } else if (shouldUseLegacyProcessRestartAfterUpdate({ updateMode: params.result.mode })) { + } else if ( + !refreshedGatewayAlreadyHealthy && + shouldUseLegacyProcessRestartAfterUpdate({ updateMode: params.result.mode }) + ) { restarted = await runDaemonRestart(); - } else if (!params.opts.json) { + } else if (!refreshedGatewayAlreadyHealthy && !params.opts.json) { defaultRuntime.log(theme.muted("No installed gateway service found; skipped restart.")); } const shouldVerifyRestart = - restartInitiated || (restarted && expectedGatewayVersion !== undefined); + refreshedGatewayAlreadyHealthy || + restartInitiated || + (restarted && expectedGatewayVersion !== undefined); if (shouldVerifyRestart) { const restartHealthy = await verifyRestartedGateway(expectedGatewayVersion); if (!restartHealthy) { diff --git a/src/daemon/launchd-plist.ts b/src/daemon/launchd-plist.ts index b0f70a83eac..c442ee9cbcf 100644 --- a/src/daemon/launchd-plist.ts +++ b/src/daemon/launchd-plist.ts @@ -1,10 +1,11 @@ import fs from "node:fs/promises"; import type { GatewayServiceEnvironmentValueSource } from "./service-types.js"; -// launchd applies ThrottleInterval to any rapid relaunch, including -// intentional gateway restarts. Keep it low so CLI restarts and forced -// reinstalls do not stall for a full minute. -export const LAUNCH_AGENT_THROTTLE_INTERVAL_SECONDS = 1; +// launchd defaults to a 10s spawn throttle. Keep that default explicitly so +// crash loops back off instead of respawning every second while still allowing +// explicit kickstart restarts to take effect. +export const LAUNCH_AGENT_THROTTLE_INTERVAL_SECONDS = 10; +export const LAUNCH_AGENT_EXIT_TIMEOUT_SECONDS = 20; // launchd stores plist integer values in decimal; 0o077 renders as 63 (owner-only files). export const LAUNCH_AGENT_UMASK_DECIMAL = 0o077; @@ -178,5 +179,5 @@ export function buildLaunchAgentPlist({ ? `\n Comment\n ${plistEscape(comment.trim())}` : ""; const envXml = renderEnvDict(environment); - return `\n\n\n \n Label\n ${plistEscape(label)}\n ${commentXml}\n RunAtLoad\n \n KeepAlive\n \n ThrottleInterval\n ${LAUNCH_AGENT_THROTTLE_INTERVAL_SECONDS}\n Umask\n ${LAUNCH_AGENT_UMASK_DECIMAL}\n ProgramArguments\n ${argsXml}\n \n ${workingDirXml}\n StandardOutPath\n ${plistEscape(stdoutPath)}\n StandardErrorPath\n ${plistEscape(stderrPath)}${envXml}\n \n\n`; + return `\n\n\n \n Label\n ${plistEscape(label)}\n ${commentXml}\n RunAtLoad\n \n KeepAlive\n \n ExitTimeOut\n ${LAUNCH_AGENT_EXIT_TIMEOUT_SECONDS}\n ThrottleInterval\n ${LAUNCH_AGENT_THROTTLE_INTERVAL_SECONDS}\n Umask\n ${LAUNCH_AGENT_UMASK_DECIMAL}\n ProgramArguments\n ${argsXml}\n \n ${workingDirXml}\n StandardOutPath\n ${plistEscape(stdoutPath)}\n StandardErrorPath\n ${plistEscape(stderrPath)}${envXml}\n \n\n`; } diff --git a/src/daemon/launchd.test.ts b/src/daemon/launchd.test.ts index 8da56149202..c288bb1da3f 100644 --- a/src/daemon/launchd.test.ts +++ b/src/daemon/launchd.test.ts @@ -1,6 +1,7 @@ import { PassThrough } from "node:stream"; import { beforeEach, describe, expect, it, vi } from "vitest"; import { + LAUNCH_AGENT_EXIT_TIMEOUT_SECONDS, LAUNCH_AGENT_THROTTLE_INTERVAL_SECONDS, LAUNCH_AGENT_UMASK_DECIMAL, } from "./launchd-plist.js"; @@ -562,7 +563,7 @@ describe("launchd install", () => { expect(state.dirModes.get(tmpDir)).toBe(0o700); }); - it("writes KeepAlive=true policy with restrictive umask", async () => { + it("writes KeepAlive=true policy with shutdown and throttle limits", async () => { const env = createDefaultLaunchdEnv(); await installLaunchAgent({ env, @@ -575,6 +576,8 @@ describe("launchd install", () => { expect(plist).toContain("KeepAlive"); expect(plist).toContain(""); expect(plist).not.toContain("SuccessfulExit"); + expect(plist).toContain("ExitTimeOut"); + expect(plist).toContain(`${LAUNCH_AGENT_EXIT_TIMEOUT_SECONDS}`); expect(plist).toContain("Umask"); expect(plist).toContain(`${LAUNCH_AGENT_UMASK_DECIMAL}`); expect(plist).toContain("ThrottleInterval"); diff --git a/src/gateway/server/http-listen.test.ts b/src/gateway/server/http-listen.test.ts index 12358713faf..89882edd002 100644 --- a/src/gateway/server/http-listen.test.ts +++ b/src/gateway/server/http-listen.test.ts @@ -63,14 +63,9 @@ describe("listenGatewayHttpServer", () => { it("throws GatewayLockError after EADDRINUSE retries are exhausted", async () => { sleepMock.mockClear(); - const fake = createFakeHttpServer([ - { kind: "error", code: "EADDRINUSE" }, - { kind: "error", code: "EADDRINUSE" }, - { kind: "error", code: "EADDRINUSE" }, - { kind: "error", code: "EADDRINUSE" }, - { kind: "error", code: "EADDRINUSE" }, - { kind: "error", code: "EADDRINUSE" }, - ]); + const fake = createFakeHttpServer( + Array.from({ length: 22 }, () => ({ kind: "error" as const, code: "EADDRINUSE" })), + ); await expect( listenGatewayHttpServer({ @@ -80,7 +75,7 @@ describe("listenGatewayHttpServer", () => { }), ).rejects.toBeInstanceOf(GatewayLockError); - expect(fake.closeCalls).toBe(4); + expect(fake.closeCalls).toBe(20); }); it("wraps non-EADDRINUSE errors as GatewayLockError", async () => { diff --git a/src/gateway/server/http-listen.ts b/src/gateway/server/http-listen.ts index 0aa9f7b399f..2472092965f 100644 --- a/src/gateway/server/http-listen.ts +++ b/src/gateway/server/http-listen.ts @@ -2,7 +2,7 @@ import type { Server as HttpServer } from "node:http"; import { GatewayLockError } from "../../infra/gateway-lock.js"; import { sleep } from "../../utils.js"; -const EADDRINUSE_MAX_RETRIES = 4; +const EADDRINUSE_MAX_RETRIES = 20; const EADDRINUSE_RETRY_INTERVAL_MS = 500; async function closeServerQuietly(httpServer: HttpServer): Promise {