diff --git a/CHANGELOG.md b/CHANGELOG.md index ad574cd4621..c4cb8692eb9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -19,6 +19,7 @@ Docs: https://docs.openclaw.ai - Channels/commands: make generated `/dock-*` commands switch the active session reply route through `session.identityLinks` instead of falling through to normal chat. Fixes #69206; carries forward #73033. Thanks @clawbones and @michaelatamuk. - Providers/Cloudflare AI Gateway: strip assistant prefill turns from Anthropic Messages payloads when thinking is enabled, so Claude requests through Cloudflare AI Gateway no longer fail Anthropic conversation-ending validation. Fixes #72905; carries forward #73005. Thanks @AaronFaby and @sahilsatralkar. - Gateway/startup: keep primary-model startup prewarm on scoped metadata preparation, let native approval bootstraps retry outside channel startup, and skip the global hook runner when no `gateway_start` hook is registered, so clean post-ready sidecar work stays off the critical path. Refs #72846. Thanks @RayWoo, @livekm0309, and @mrz1836. +- Gateway/supervisor: exit cleanly when a supervised restart finds an existing healthy gateway and bound retries when the existing gateway stays unhealthy, so stale lock contention cannot loop indefinitely. Refs #72846. Thanks @azgardtek. - Gateway/startup: scope primary-model provider discovery during channel prewarm to the configured provider owner and add split startup trace timings, so boot avoids staging unrelated bundled provider dependencies while setup discovery remains broad. Fixes #73002. Thanks @Schnup03. - Channels/Microsoft Teams: unwrap staged CommonJS JWT runtime dependencies before Bot Connector token validation so inbound Teams messages no longer 401 after the bundled runtime-deps move. Fixes #73026. Thanks @kbrown10000. - Channels/sessions: prevent guarded inbound session recording from creating route-only phantom sessions while still allowing last-route updates for sessions that already exist. 
Carries forward #73009. Thanks @jzakirov. diff --git a/docs/gateway/gateway-lock.md b/docs/gateway/gateway-lock.md index 524ba0826da..5296128df20 100644 --- a/docs/gateway/gateway-lock.md +++ b/docs/gateway/gateway-lock.md @@ -14,10 +14,11 @@ title: "Gateway lock" ## Mechanism -- The gateway binds the WebSocket listener (default `ws://127.0.0.1:18789`) immediately on startup using an exclusive TCP listener. +- The gateway first acquires a per-config lock file under the state lock directory and probes the configured port for an existing listener. +- If the recorded lock owner is gone, the port is free, or the lock is stale, startup reclaims the lock and continues. +- The gateway then binds the HTTP/WebSocket listener (default `ws://127.0.0.1:18789`) using an exclusive TCP listener. - If the bind fails with `EADDRINUSE`, startup throws `GatewayLockError("another gateway instance is already listening on ws://127.0.0.1:<port>")`. -- The OS releases the listener automatically on any process exit, including crashes and SIGKILL—no separate lock file or cleanup step is needed. -- On shutdown the gateway closes the WebSocket server and underlying HTTP server to free the port promptly. +- On shutdown the gateway closes the HTTP/WebSocket server and removes the lock file. ## Error surface @@ -27,7 +28,8 @@ title: "Gateway lock" ## Operational notes - If the port is occupied by _another_ process, the error is the same; free the port or choose another with `openclaw gateway --port <port>`. -- The macOS app still maintains its own lightweight PID guard before spawning the gateway; the runtime lock is enforced by the WebSocket bind. +- Under a service supervisor, a new gateway process that sees an existing healthy `/healthz` responder exits successfully and leaves that process in control. If the existing process never becomes healthy, retries are bounded and startup fails with a clear lock error instead of looping forever. 
+- The macOS app still maintains its own lightweight PID guard before spawning the gateway; the runtime lock is enforced by the lock file plus HTTP/WebSocket bind. ## Related diff --git a/src/cli/gateway-cli/run.supervised-lock.test.ts b/src/cli/gateway-cli/run.supervised-lock.test.ts new file mode 100644 index 00000000000..68f74e58188 --- /dev/null +++ b/src/cli/gateway-cli/run.supervised-lock.test.ts @@ -0,0 +1,93 @@ +import { describe, expect, it, vi } from "vitest"; +import { GatewayLockError } from "../../infra/gateway-lock.js"; +import { __testing } from "./run.js"; + +function createLogger() { + return { + info: vi.fn(), + warn: vi.fn(), + }; +} + +describe("supervised gateway lock recovery", () => { + it("does not retry gateway lock errors outside a supervisor", async () => { + const err = new GatewayLockError("gateway already running"); + const startLoop = vi.fn(async () => { + throw err; + }); + + await expect( + __testing.runGatewayLoopWithSupervisedLockRecovery({ + startLoop, + supervisor: null, + port: 18789, + healthHost: "127.0.0.1", + log: createLogger(), + }), + ).rejects.toBe(err); + + expect(startLoop).toHaveBeenCalledTimes(1); + }); + + it("leaves a healthy supervised gateway in control", async () => { + const startLoop = vi.fn(async () => { + throw new GatewayLockError("gateway already running"); + }); + const probeHealth = vi.fn(async () => true); + const log = createLogger(); + + await __testing.runGatewayLoopWithSupervisedLockRecovery({ + startLoop, + supervisor: "systemd", + port: 18789, + healthHost: "0.0.0.0", + log, + probeHealth, + }); + + expect(startLoop).toHaveBeenCalledTimes(1); + expect(probeHealth).toHaveBeenCalledWith({ host: "0.0.0.0", port: 18789 }); + expect(log.info).toHaveBeenCalledWith( + "gateway already running under systemd; existing gateway is healthy, leaving it in control", + ); + expect(log.warn).not.toHaveBeenCalled(); + }); + + it("bounds supervised retries when the existing gateway stays unhealthy", async () => { 
+ let now = 0; + const startLoop = vi.fn(async () => { + throw new GatewayLockError("gateway already running"); + }); + const sleep = vi.fn(async (ms: number) => { + now += ms; + }); + + await expect( + __testing.runGatewayLoopWithSupervisedLockRecovery({ + startLoop, + supervisor: "systemd", + port: 18789, + healthHost: "127.0.0.1", + log: createLogger(), + probeHealth: vi.fn(async () => false), + now: () => now, + sleep, + retryMs: 5, + timeoutMs: 12, + }), + ).rejects.toThrow( + "gateway already running under systemd; existing gateway did not become healthy after 12ms", + ); + + expect(startLoop).toHaveBeenCalledTimes(4); + expect(sleep).toHaveBeenNthCalledWith(1, 5); + expect(sleep).toHaveBeenNthCalledWith(2, 5); + expect(sleep).toHaveBeenNthCalledWith(3, 2); + }); + + it("normalizes wildcard bind hosts for local health probes", () => { + expect(__testing.normalizeGatewayHealthProbeHost("0.0.0.0")).toBe("127.0.0.1"); + expect(__testing.normalizeGatewayHealthProbeHost("::")).toBe("127.0.0.1"); + expect(__testing.normalizeGatewayHealthProbeHost("127.0.0.1")).toBe("127.0.0.1"); + }); +}); diff --git a/src/cli/gateway-cli/run.ts b/src/cli/gateway-cli/run.ts index 82c63e47c19..4cb30d534e1 100644 --- a/src/cli/gateway-cli/run.ts +++ b/src/cli/gateway-cli/run.ts @@ -1,4 +1,5 @@ import fs from "node:fs"; +import { request } from "node:http"; import path from "node:path"; import type { Command } from "commander"; import { readSecretFromFile } from "../../acp/secret-file.js"; @@ -111,8 +112,11 @@ const GATEWAY_RUN_BOOLEAN_KEYS = [ ] as const; const SUPERVISED_GATEWAY_LOCK_RETRY_MS = 5000; +const SUPERVISED_GATEWAY_LOCK_RETRY_TIMEOUT_MS = 30_000; +const SUPERVISED_GATEWAY_HEALTH_PROBE_TIMEOUT_MS = 1000; type Awaitable = T | Promise; +type GatewayRunLogger = Pick, "info" | "warn">; /** * EX_CONFIG (78) from sysexits.h — used for configuration errors so systemd @@ -356,6 +360,107 @@ function isHealthyGatewayLockError(err: unknown): boolean { ); } +function 
normalizeGatewayHealthProbeHost(host: string): string { + if (host === "0.0.0.0" || host === "::") { /* wildcard bind addresses are probed through the IPv4 loopback address instead */ + return "127.0.0.1"; + } + return host; +} + +/** Sends GET /healthz to the normalized host and port and resolves true when a response arrives with a numeric status code below 500; resolves false when the request emits "timeout" or "error". timeoutMs defaults to SUPERVISED_GATEWAY_HEALTH_PROBE_TIMEOUT_MS. The returned promise never rejects. */ async function probeGatewayHealthz(params: { + host: string; + port: number; + timeoutMs?: number; +}): Promise { + const timeoutMs = params.timeoutMs ?? SUPERVISED_GATEWAY_HEALTH_PROBE_TIMEOUT_MS; + return await new Promise((resolve) => { + const req = request( + { + hostname: normalizeGatewayHealthProbeHost(params.host), + port: params.port, + path: "/healthz", + method: "GET", + timeout: timeoutMs, + }, + (res) => { + res.resume(); /* discard the response body; only the status code is inspected */ + resolve(typeof res.statusCode === "number" && res.statusCode < 500); /* NOTE(review): 3xx/4xx also count as healthy here - confirm that is intended */ + }, + ); + req.once("timeout", () => { + req.destroy(); /* tear the request down before reporting the probe as failed */ + resolve(false); + }); + req.once("error", () => { + resolve(false); + }); + req.end(); + }); +} + +/** Runs params.startLoop exactly once when params.supervisor is falsy. Otherwise it retries startLoop for as long as it throws a GatewayLockError whose message includes "gateway already running": it returns normally when probeHealth reports the existing gateway healthy, and throws a new GatewayLockError once timeoutMs has elapsed since the first attempt. Any other error is rethrown unchanged. now, sleep, probeHealth, retryMs and timeoutMs are optional overrides with defaults taken from Date.now, setTimeout, probeGatewayHealthz and the SUPERVISED_GATEWAY_LOCK_* constants. */ async function runGatewayLoopWithSupervisedLockRecovery(params: { + startLoop: () => Promise; + supervisor: ReturnType; + port: number; + healthHost: string; + log: GatewayRunLogger; + now?: () => number; + sleep?: (ms: number) => Promise; + probeHealth?: (params: { host: string; port: number }) => Promise; + retryMs?: number; + timeoutMs?: number; +}) { + const supervisor = params.supervisor; + if (!supervisor) { /* no supervisor: a single attempt, errors propagate to the caller */ + await params.startLoop(); + return; + } + + const now = params.now ?? Date.now; + const sleep = + params.sleep ?? (async (ms: number) => await new Promise((resolve) => setTimeout(resolve, ms))); + const probeHealth = params.probeHealth ?? ((probeParams) => probeGatewayHealthz(probeParams)); + const retryMs = params.retryMs ?? SUPERVISED_GATEWAY_LOCK_RETRY_MS; + const timeoutMs = params.timeoutMs ?? 
SUPERVISED_GATEWAY_LOCK_RETRY_TIMEOUT_MS; + const startedAt = now(); /* the timeout budget is measured from before the first start attempt */ + + for (;;) { + try { + await params.startLoop(); + return; + } catch (err) { + const isGatewayAlreadyRunning = + err instanceof GatewayLockError && + typeof err.message === "string" && + err.message.includes("gateway already running"); + if (!isGatewayAlreadyRunning) { + throw err; /* only the already-running lock error is retried */ + } + + if (await probeHealth({ host: params.healthHost, port: params.port })) { + params.log.info( + `gateway already running under ${supervisor}; existing gateway is healthy, leaving it in control`, + ); + return; /* resolve without throwing so the caller sees a successful run */ + } + + const elapsedMs = now() - startedAt; + if (elapsedMs >= timeoutMs) { + throw new GatewayLockError( + `gateway already running under ${supervisor}; existing gateway did not become healthy after ${timeoutMs}ms`, + err, /* the original lock error is passed as the second constructor argument */ + ); + } + + const waitMs = Math.min(retryMs, Math.max(0, timeoutMs - elapsedMs)); /* never sleep past the remaining timeout budget */ + params.log.warn( + `gateway already running under ${supervisor}; waiting ${waitMs}ms before retrying startup`, + ); + await sleep(waitMs); + } + } +} + function maybeWriteGatewayStartupFailureBundle(err: unknown): void { const result = writeDiagnosticStabilityBundleForFailureSync("gateway.startup_failed", err); if ("message" in result) { @@ -680,11 +785,12 @@ async function runGatewayCommand(opts: GatewayRunOpts) { gatewayLog.info("starting..."); startupTrace.mark("cli.gateway-loop"); + const healthHost = await resolveGatewayBindHost(bind, cfg.gateway?.customBindHost); const startLoop = async () => await runGatewayLoop({ runtime: defaultRuntime, lockPort: port, - healthHost: await resolveGatewayBindHost(bind, cfg.gateway?.customBindHost), + healthHost, start: async ({ startupStartedAt } = {}) => await startGatewayServer(port, { bind, @@ -695,25 +801,13 @@ async function runGatewayCommand(opts: GatewayRunOpts) { }); try { - const supervisor = detectRespawnSupervisor(process.env); - while (true) { - try { - await startLoop(); - break; - } catch (err) { - const isGatewayAlreadyRunning = - err instanceof 
GatewayLockError && - typeof err.message === "string" && - err.message.includes("gateway already running"); - if (!supervisor || !isGatewayAlreadyRunning) { - throw err; - } - gatewayLog.warn( - `gateway already running under ${supervisor}; waiting ${SUPERVISED_GATEWAY_LOCK_RETRY_MS}ms before retrying startup`, - ); - await new Promise((resolve) => setTimeout(resolve, SUPERVISED_GATEWAY_LOCK_RETRY_MS)); - } - } + await runGatewayLoopWithSupervisedLockRecovery({ + startLoop, + supervisor: detectRespawnSupervisor(process.env), + port, + healthHost, + log: gatewayLog, + }); } catch (err) { if (isGatewayLockError(err)) { const errMessage = formatErrorMessage(err); @@ -740,6 +834,11 @@ async function runGatewayCommand(opts: GatewayRunOpts) { } } +export const __testing = { + normalizeGatewayHealthProbeHost, + runGatewayLoopWithSupervisedLockRecovery, +}; + export function addGatewayRunCommand(cmd: Command): Command { return cmd .option("--port ", "Port for the gateway WebSocket")