From 9289a502bb070c51dd86e9b3f10538398e36da54 Mon Sep 17 00:00:00 2001 From: Peter Steinberger Date: Thu, 30 Apr 2026 16:30:56 +0100 Subject: [PATCH] fix(gateway): stop systemd EADDRINUSE restart loops --- CHANGELOG.md | 1 + docs/gateway/gateway-lock.md | 2 +- .../gateway-cli/run.supervised-lock.test.ts | 76 ++++++++++++++++++- src/cli/gateway-cli/run.ts | 36 ++++++--- 4 files changed, 102 insertions(+), 13 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 72d8ab83870..7eb9690e206 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,6 +10,7 @@ Docs: https://docs.openclaw.ai ### Fixes +- Gateway/systemd: exit with sysexits 78 for supervised lock and `EADDRINUSE` conflicts so `RestartPreventExitStatus=78` stops `Restart=always` restart loops instead of repeatedly reloading plugins against an occupied port. Fixes #75115. Thanks @yhyatt. - Plugins/runtime-deps: replace stale symlinked mirror target roots before writing runtime-mirror temp files and skip rewriting already materialized hardlinks, so cross-version container upgrades no longer crash-loop on read-only image-layer paths while warm mirrors do less churn. Fixes #75108; refs #75069. Thanks @coletebou and @xiaohuaxi. - Auto-reply/group chats: fall back to automatic source delivery when a channel precomputes message-tool-only replies but the `message` tool is unavailable, so Discord/Slack-style group turns do not silently complete without a visible reply. Fixes #74868. Thanks @kagura-agent. - Browser/gateway: share one browser control runtime across the HTTP control server and `browser.request`, and refresh browser profile config from the source snapshot, so CLI status/start honors configured `browser.executablePath`, `headless`, and `noSandbox` instead of falling back to stale auto-detection. Fixes #75087; repairs #73617. Thanks @civiltox and @martingarramon. 
diff --git a/docs/gateway/gateway-lock.md b/docs/gateway/gateway-lock.md index 5296128df20..07622b87679 100644 --- a/docs/gateway/gateway-lock.md +++ b/docs/gateway/gateway-lock.md @@ -28,7 +28,7 @@ title: "Gateway lock" ## Operational notes - If the port is occupied by _another_ process, the error is the same; free the port or choose another with `openclaw gateway --port <port>`. -- Under a service supervisor, a new gateway process that sees an existing healthy `/healthz` responder exits successfully and leaves that process in control. If the existing process never becomes healthy, retries are bounded and startup fails with a clear lock error instead of looping forever. +- Under a service supervisor, a new gateway process that sees an existing healthy `/healthz` responder leaves that process in control. On systemd, the duplicate starter exits with code 78 so the default `RestartPreventExitStatus=78` stops `Restart=always` from looping on a lock or `EADDRINUSE` conflict. If the existing process never becomes healthy, retries are bounded and startup fails with a clear lock error instead of looping forever. - The macOS app still maintains its own lightweight PID guard before spawning the gateway; the runtime lock is enforced by the lock file plus HTTP/WebSocket bind. 
## Related diff --git a/src/cli/gateway-cli/run.supervised-lock.test.ts b/src/cli/gateway-cli/run.supervised-lock.test.ts index 68f74e58188..d954e287680 100644 --- a/src/cli/gateway-cli/run.supervised-lock.test.ts +++ b/src/cli/gateway-cli/run.supervised-lock.test.ts @@ -29,7 +29,7 @@ describe("supervised gateway lock recovery", () => { expect(startLoop).toHaveBeenCalledTimes(1); }); - it("leaves a healthy supervised gateway in control", async () => { + it("leaves a healthy launchd-supervised gateway in control", async () => { const startLoop = vi.fn(async () => { throw new GatewayLockError("gateway already running"); }); @@ -38,7 +38,7 @@ describe("supervised gateway lock recovery", () => { await __testing.runGatewayLoopWithSupervisedLockRecovery({ startLoop, - supervisor: "systemd", + supervisor: "launchd", port: 18789, healthHost: "0.0.0.0", log, @@ -48,11 +48,38 @@ describe("supervised gateway lock recovery", () => { expect(startLoop).toHaveBeenCalledTimes(1); expect(probeHealth).toHaveBeenCalledWith({ host: "0.0.0.0", port: 18789 }); expect(log.info).toHaveBeenCalledWith( - "gateway already running under systemd; existing gateway is healthy, leaving it in control", + "gateway already running under launchd; existing gateway is healthy, leaving it in control", ); expect(log.warn).not.toHaveBeenCalled(); }); + it("uses exit 78 semantics for healthy systemd-supervised lock conflicts", async () => { + const startLoop = vi.fn(async () => { + throw new GatewayLockError("another gateway instance is already listening"); + }); + const probeHealth = vi.fn(async () => true); + + await expect( + __testing.runGatewayLoopWithSupervisedLockRecovery({ + startLoop, + supervisor: "systemd", + port: 18789, + healthHost: "127.0.0.1", + log: createLogger(), + probeHealth, + }), + ).rejects.toThrow("exiting with code 78 to prevent a systemd Restart=always loop"); + + expect(startLoop).toHaveBeenCalledTimes(1); + expect(probeHealth).toHaveBeenCalledWith({ host: "127.0.0.1", port: 
18789 }); + expect( + __testing.resolveGatewayLockErrorExitCode( + new GatewayLockError("gateway already running under systemd; existing gateway is healthy"), + "systemd", + ), + ).toBe(78); + }); + it("bounds supervised retries when the existing gateway stays unhealthy", async () => { let now = 0; const startLoop = vi.fn(async () => { @@ -85,6 +112,49 @@ describe("supervised gateway lock recovery", () => { expect(sleep).toHaveBeenNthCalledWith(3, 2); }); + it("bounds supervised retries for EADDRINUSE lock errors", async () => { + let now = 0; + const startLoop = vi.fn(async () => { + throw new GatewayLockError( + "another gateway instance is already listening on ws://127.0.0.1:18789", + ); + }); + const sleep = vi.fn(async (ms: number) => { + now += ms; + }); + + await expect( + __testing.runGatewayLoopWithSupervisedLockRecovery({ + startLoop, + supervisor: "systemd", + port: 18789, + healthHost: "127.0.0.1", + log: createLogger(), + probeHealth: vi.fn(async () => false), + now: () => now, + sleep, + retryMs: 5, + timeoutMs: 12, + }), + ).rejects.toThrow( + "gateway already running under systemd; existing gateway did not become healthy after 12ms", + ); + + expect(startLoop).toHaveBeenCalledTimes(4); + expect(sleep).toHaveBeenNthCalledWith(1, 5); + expect(sleep).toHaveBeenNthCalledWith(2, 5); + expect(sleep).toHaveBeenNthCalledWith(3, 2); + }); + + it("keeps unmanaged duplicate starts on the existing exit-success path", () => { + expect( + __testing.resolveGatewayLockErrorExitCode( + new GatewayLockError("another gateway instance is already listening"), + null, + ), + ).toBe(0); + }); + it("normalizes wildcard bind hosts for local health probes", () => { expect(__testing.normalizeGatewayHealthProbeHost("0.0.0.0")).toBe("127.0.0.1"); expect(__testing.normalizeGatewayHealthProbeHost("::")).toBe("127.0.0.1"); diff --git a/src/cli/gateway-cli/run.ts b/src/cli/gateway-cli/run.ts index 38701c56054..9abd816090c 100644 --- a/src/cli/gateway-cli/run.ts +++ 
b/src/cli/gateway-cli/run.ts @@ -362,7 +362,7 @@ function isGatewayLockError(err: unknown): err is GatewayLockError { ); } -function isHealthyGatewayLockError(err: unknown): boolean { +function isGatewayAlreadyRunningLockError(err: unknown): boolean { if (!isGatewayLockError(err) || typeof err.message !== "string") { return false; } @@ -372,6 +372,20 @@ function isHealthyGatewayLockError(err: unknown): boolean { ); } +function isHealthyGatewayLockError(err: unknown): boolean { + return isGatewayAlreadyRunningLockError(err); +} + +function resolveGatewayLockErrorExitCode( + err: unknown, + supervisor: RespawnSupervisor | null, +): number { + if (supervisor === "systemd" && isGatewayAlreadyRunningLockError(err)) { + return EXIT_CONFIG_ERROR; + } + return isHealthyGatewayLockError(err) ? 0 : 1; +} + function normalizeGatewayHealthProbeHost(host: string): string { if (host === "0.0.0.0" || host === "::") { return "127.0.0.1"; @@ -441,15 +455,17 @@ async function runGatewayLoopWithSupervisedLockRecovery(params: { await params.startLoop(); return; } catch (err) { - const isGatewayAlreadyRunning = - err instanceof GatewayLockError && - typeof err.message === "string" && - err.message.includes("gateway already running"); - if (!isGatewayAlreadyRunning) { + if (!isGatewayAlreadyRunningLockError(err)) { throw err; } if (await probeHealth({ host: params.healthHost, port: params.port })) { + if (supervisor === "systemd") { + throw new GatewayLockError( + "gateway already running under systemd; existing gateway is healthy, exiting with code 78 to prevent a systemd Restart=always loop", + err, + ); + } params.log.info( `gateway already running under ${supervisor}; existing gateway is healthy, leaving it in control`, ); @@ -822,11 +838,12 @@ async function runGatewayCommand(opts: GatewayRunOpts) { }), }); + const { detectRespawnSupervisor } = await import("../../infra/supervisor-markers.js"); + const supervisor = detectRespawnSupervisor(process.env); try { - const { 
detectRespawnSupervisor } = await import("../../infra/supervisor-markers.js"); await runGatewayLoopWithSupervisedLockRecovery({ startLoop, - supervisor: detectRespawnSupervisor(process.env), + supervisor, port, healthHost, log: gatewayLog, @@ -850,7 +867,7 @@ async function runGatewayCommand(opts: GatewayRunOpts) { } const { maybeExplainGatewayServiceStop } = await import("./shared.js"); await maybeExplainGatewayServiceStop(); - defaultRuntime.exit(isHealthyGatewayLockError(err) ? 0 : 1); + defaultRuntime.exit(resolveGatewayLockErrorExitCode(err, supervisor)); return; } await maybeWriteGatewayStartupFailureBundle(err); @@ -861,6 +878,7 @@ async function runGatewayCommand(opts: GatewayRunOpts) { export const __testing = { normalizeGatewayHealthProbeHost, + resolveGatewayLockErrorExitCode, runGatewayLoopWithSupervisedLockRecovery, };