From 39f7dbfe02ce99136e2c79b5868a39a24b9f99ca Mon Sep 17 00:00:00 2001 From: Peter Steinberger Date: Thu, 26 Feb 2026 23:02:47 +0100 Subject: [PATCH] fix(cli): make gateway --force resilient to lsof EACCES --- CHANGELOG.md | 1 + src/cli/ports.ts | 188 +++++++++++++++++++++++++++++++--- src/cli/program.force.test.ts | 93 ++++++++++++++++- 3 files changed, 266 insertions(+), 16 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 0973119584f..9170f6bc7db 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -61,6 +61,7 @@ Docs: https://docs.openclaw.ai - Doctor/State integrity: ignore metadata-only slash routing sessions when checking recent missing transcripts so `openclaw doctor` no longer reports false-positive transcript-missing warnings for `*:slash:*` keys. (#27375) thanks @gumadeiras. - CLI/Gateway status: force local `gateway status` probe host to `127.0.0.1` for `bind=lan` so co-located probes do not trip non-loopback plaintext WebSocket checks. (#26997) thanks @chikko80. - CLI/Gateway auth: align `gateway run --auth` parsing/help text with supported gateway auth modes by accepting `none` and `trusted-proxy` (in addition to `token`/`password`) for CLI overrides. (#27469) thanks @s1korrrr. +- CLI/Gateway `--force` in non-root Docker: recover from `lsof` permission failures (`EACCES`/`EPERM`) by falling back to `fuser` kill + probe-based port checks, so `openclaw gateway --force` works for default container `node` user flows. (#27941) - CLI/Daemon status TLS probe: use `wss://` and forward local TLS certificate fingerprint for TLS-enabled gateway daemon probes so `openclaw daemon status` works with `gateway.bind=lan` + `gateway.tls.enabled=true`. (#24234) thanks @liuy. - Podman/Default bind: change `run-openclaw-podman.sh` default gateway bind from `lan` to `loopback` and document explicit LAN opt-in with Control UI origin configuration. (#27491) thanks @robbyczgw-cla. - Daemon/macOS launchd: forward proxy env vars into supervised service environments, keep LaunchAgent `KeepAlive=true` semantics, and harden restart sequencing to `print -> bootout -> wait old pid exit -> bootstrap -> kickstart`. (#27276) thanks @frankekn. diff --git a/src/cli/ports.ts b/src/cli/ports.ts index ab5a3979979..30ebd3f4123 100644 --- a/src/cli/ports.ts +++ b/src/cli/ports.ts @@ -1,5 +1,6 @@ import { execFileSync } from "node:child_process"; import { resolveLsofCommandSync } from "../infra/ports-lsof.js"; +import { tryListenOnPort } from "../infra/ports-probe.js"; import { sleep } from "../utils.js"; export type PortProcess = { pid: number; command?: string }; @@ -10,6 +11,132 @@ export type ForceFreePortResult = { escalatedToSigkill: boolean; }; +type ExecFileError = NodeJS.ErrnoException & { + status?: number | null; + stderr?: string | Buffer; + stdout?: string | Buffer; + cause?: unknown; +}; + +const FUSER_SIGNALS: Record<"SIGTERM" | "SIGKILL", string> = { + SIGTERM: "TERM", + SIGKILL: "KILL", +}; + +function readExecOutput(value: string | Buffer | undefined): string { + if (typeof value === "string") { + return value; + } + if (value instanceof Buffer) { + return value.toString("utf8"); + } + return ""; +} + +function withErrnoCode(message: string, code: string, cause: unknown): Error { + const out = new Error(message, { cause: cause instanceof Error ? cause : undefined }) as Error & + NodeJS.ErrnoException; + out.code = code; + return out; +} + +function getErrnoCode(err: unknown): string | undefined { + if (!err || typeof err !== "object") { + return undefined; + } + const direct = (err as { code?: unknown }).code; + if (typeof direct === "string" && direct.length > 0) { + return direct; + } + const cause = (err as { cause?: unknown }).cause; + if (cause && typeof cause === "object") { + const nested = (cause as { code?: unknown }).code; + if (typeof nested === "string" && nested.length > 0) { + return nested; + } + } + return undefined; +} + +function isRecoverableLsofError(err: unknown): boolean { + const code = getErrnoCode(err); + if (code === "ENOENT" || code === "EACCES" || code === "EPERM") { + return true; + } + const message = err instanceof Error ? err.message : String(err); + return /lsof.*(permission denied|not permitted|operation not permitted|eacces|eperm)/i.test( + message, + ); +} + +function parseFuserPidList(output: string): number[] { + if (!output) { + return []; + } + const values = new Set(); + for (const rawLine of output.split(/\r?\n/)) { + const line = rawLine.trim(); + if (!line) { + continue; + } + const pidRegion = line.includes(":") ? line.slice(line.indexOf(":") + 1) : line; + const pidMatches = pidRegion.match(/\d+/g) ?? []; + for (const match of pidMatches) { + const pid = Number.parseInt(match, 10); + if (Number.isFinite(pid) && pid > 0) { + values.add(pid); + } + } + } + return [...values]; +} + +function killPortWithFuser(port: number, signal: "SIGTERM" | "SIGKILL"): PortProcess[] { + const args = ["-k", `-${FUSER_SIGNALS[signal]}`, `${port}/tcp`]; + try { + const stdout = execFileSync("fuser", args, { + encoding: "utf-8", + stdio: ["ignore", "pipe", "pipe"], + }); + return parseFuserPidList(stdout).map((pid) => ({ pid })); + } catch (err: unknown) { + const execErr = err as ExecFileError; + const code = execErr.code; + const status = execErr.status; + const stdout = readExecOutput(execErr.stdout); + const stderr = readExecOutput(execErr.stderr); + const parsed = parseFuserPidList([stdout, stderr].filter(Boolean).join("\n")); + if (status === 1) { + // fuser exits 1 if nothing matched; keep any parsed PIDs in case signal succeeded. + return parsed.map((pid) => ({ pid })); + } + if (code === "ENOENT") { + throw withErrnoCode( + "fuser not found; required for --force when lsof is unavailable", + "ENOENT", + err, + ); + } + if (code === "EACCES" || code === "EPERM") { + throw withErrnoCode("fuser permission denied while forcing gateway port", code, err); + } + throw err instanceof Error ? err : new Error(String(err)); + } +} + +async function isPortBusy(port: number): Promise { + try { + await tryListenOnPort({ port, exclusive: true }); + return false; + } catch (err: unknown) { + const code = (err as NodeJS.ErrnoException).code; + if (code === "EADDRINUSE") { + return true; + } + throw err instanceof Error ? err : new Error(String(err)); + } +} + export function parseLsofOutput(output: string): PortProcess[] { const lines = output.split(/\r?\n/).filter(Boolean); const results: PortProcess[] = []; @@ -38,12 +165,27 @@ export function listPortListeners(port: number): PortProcess[] { }); return parseLsofOutput(out); } catch (err: unknown) { - const status = (err as { status?: number }).status; - const code = (err as { code?: string }).code; + const execErr = err as ExecFileError; + const status = execErr.status ?? undefined; + const code = execErr.code; if (code === "ENOENT") { - throw new Error("lsof not found; required for --force", { cause: err }); + throw withErrnoCode("lsof not found; required for --force", "ENOENT", err); + } + if (code === "EACCES" || code === "EPERM") { + throw withErrnoCode("lsof permission denied while inspecting gateway port", code, err); } if (status === 1) { + const stderr = readExecOutput(execErr.stderr).trim(); + if ( + stderr && + /permission denied|not permitted|operation not permitted|can't stat/i.test(stderr) + ) { + throw withErrnoCode( + `lsof permission denied while inspecting gateway port: ${stderr}`, + "EACCES", + err, + ); + } return []; } // no listeners throw err instanceof Error ? err : new Error(String(err)); @@ -93,43 +235,65 @@ export async function forceFreePortAndWait( const intervalMs = Math.max(opts.intervalMs ?? 100, 1); const sigtermTimeoutMs = Math.min(Math.max(opts.sigtermTimeoutMs ?? 600, 0), timeoutMs); - const killed = forceFreePort(port); - if (killed.length === 0) { + let killed: PortProcess[] = []; + let useFuserFallback = false; + + try { + killed = forceFreePort(port); + } catch (err) { + if (!isRecoverableLsofError(err)) { + throw err; + } + useFuserFallback = true; + killed = killPortWithFuser(port, "SIGTERM"); + } + + const checkBusy = async (): Promise => + useFuserFallback ? isPortBusy(port) : listPortListeners(port).length > 0; + + if (!(await checkBusy())) { return { killed, waitedMs: 0, escalatedToSigkill: false }; } let waitedMs = 0; const triesSigterm = intervalMs > 0 ? Math.ceil(sigtermTimeoutMs / intervalMs) : 0; for (let i = 0; i < triesSigterm; i++) { - if (listPortListeners(port).length === 0) { + if (!(await checkBusy())) { return { killed, waitedMs, escalatedToSigkill: false }; } await sleep(intervalMs); waitedMs += intervalMs; } - if (listPortListeners(port).length === 0) { + if (!(await checkBusy())) { return { killed, waitedMs, escalatedToSigkill: false }; } - const remaining = listPortListeners(port); - killPids(remaining, "SIGKILL"); + if (useFuserFallback) { + killPortWithFuser(port, "SIGKILL"); + } else { + const remaining = listPortListeners(port); + killPids(remaining, "SIGKILL"); + } const remainingBudget = Math.max(timeoutMs - waitedMs, 0); const triesSigkill = intervalMs > 0 ? Math.ceil(remainingBudget / intervalMs) : 0; for (let i = 0; i < triesSigkill; i++) { - if (listPortListeners(port).length === 0) { + if (!(await checkBusy())) { return { killed, waitedMs, escalatedToSigkill: true }; } await sleep(intervalMs); waitedMs += intervalMs; } - const still = listPortListeners(port); - if (still.length === 0) { + if (!(await checkBusy())) { return { killed, waitedMs, escalatedToSigkill: true }; } + if (useFuserFallback) { + throw new Error(`port ${port} still has listeners after --force (fuser fallback)`); + } + const still = listPortListeners(port); throw new Error( `port ${port} still has listeners after --force: ${still.map((p) => p.pid).join(", ")}`, ); diff --git a/src/cli/program.force.test.ts b/src/cli/program.force.test.ts index 2152b132922..ac0f02904bf 100644 --- a/src/cli/program.force.test.ts +++ b/src/cli/program.force.test.ts @@ -8,6 +8,12 @@ vi.mock("node:child_process", async () => { }; }); +const tryListenOnPortMock = vi.hoisted(() => vi.fn()); + +vi.mock("../infra/ports-probe.js", () => ({ + tryListenOnPort: (...args: unknown[]) => tryListenOnPortMock(...args), +})); + import { execFileSync } from "node:child_process"; import { forceFreePort, @@ -23,6 +29,7 @@ describe("gateway --force helpers", () => { beforeEach(() => { vi.clearAllMocks(); originalKill = process.kill.bind(process); + tryListenOnPortMock.mockReset(); }); afterEach(() => { @@ -80,11 +87,13 @@ describe("gateway --force helpers", () => { let call = 0; (execFileSync as unknown as Mock).mockImplementation(() => { call += 1; - // 1st call: initial listeners to kill; 2nd call: still listed; 3rd call: gone. + // 1st call: initial listeners to kill. + // 2nd/3rd calls: still listed. + // 4th call: gone. if (call === 1) { return ["p42", "cnode", ""].join("\n"); } - if (call === 2) { + if (call === 2 || call === 3) { return ["p42", "cnode", ""].join("\n"); } return ""; @@ -105,7 +114,7 @@ describe("gateway --force helpers", () => { expect(killMock).toHaveBeenCalledWith(42, "SIGTERM"); expect(res.killed).toEqual([{ pid: 42, command: "node" }]); expect(res.escalatedToSigkill).toBe(false); - expect(res.waitedMs).toBeGreaterThan(0); + expect(res.waitedMs).toBe(100); vi.useRealTimers(); }); @@ -116,7 +125,7 @@ describe("gateway --force helpers", () => { (execFileSync as unknown as Mock).mockImplementation(() => { call += 1; // 1st call: initial kill list; then keep showing until after SIGKILL. - if (call <= 6) { + if (call <= 7) { return ["p42", "cnode", ""].join("\n"); } return ""; @@ -140,4 +149,80 @@ describe("gateway --force helpers", () => { vi.useRealTimers(); }); + + it("falls back to fuser when lsof is permission denied", async () => { + (execFileSync as unknown as Mock).mockImplementation((cmd: string) => { + if (cmd.includes("lsof")) { + const err = new Error("spawnSync lsof EACCES") as NodeJS.ErrnoException; + err.code = "EACCES"; + throw err; + } + return "18789/tcp: 4242\n"; + }); + tryListenOnPortMock.mockResolvedValue(undefined); + + const result = await forceFreePortAndWait(18789, { timeoutMs: 500, intervalMs: 100 }); + + expect(result.escalatedToSigkill).toBe(false); + expect(result.killed).toEqual([{ pid: 4242 }]); + expect(execFileSync).toHaveBeenCalledWith( + "fuser", + ["-k", "-TERM", "18789/tcp"], + expect.objectContaining({ encoding: "utf-8" }), + ); + }); + + it("uses fuser SIGKILL escalation when port stays busy", async () => { + vi.useFakeTimers(); + (execFileSync as unknown as Mock).mockImplementation((cmd: string, args: string[]) => { + if (cmd.includes("lsof")) { + const err = new Error("spawnSync lsof EACCES") as NodeJS.ErrnoException; + err.code = "EACCES"; + throw err; + } + if (args.includes("-TERM")) { + return "18789/tcp: 1337\n"; + } + if (args.includes("-KILL")) { + return "18789/tcp: 1337\n"; + } + return ""; + }); + + const busyErr = Object.assign(new Error("in use"), { code: "EADDRINUSE" }); + tryListenOnPortMock + .mockRejectedValueOnce(busyErr) + .mockRejectedValueOnce(busyErr) + .mockRejectedValueOnce(busyErr) + .mockResolvedValueOnce(undefined); + + const promise = forceFreePortAndWait(18789, { + timeoutMs: 300, + intervalMs: 100, + sigtermTimeoutMs: 100, + }); + await vi.runAllTimersAsync(); + const result = await promise; + + expect(result.escalatedToSigkill).toBe(true); + expect(result.waitedMs).toBe(100); + expect(execFileSync).toHaveBeenCalledWith( + "fuser", + ["-k", "-KILL", "18789/tcp"], + expect.objectContaining({ encoding: "utf-8" }), + ); + vi.useRealTimers(); + }); + + it("throws when lsof is unavailable and fuser is missing", async () => { + (execFileSync as unknown as Mock).mockImplementation((cmd: string) => { + const err = new Error(`spawnSync ${cmd} ENOENT`) as NodeJS.ErrnoException; + err.code = "ENOENT"; + throw err; + }); + + await expect(forceFreePortAndWait(18789, { timeoutMs: 200, intervalMs: 100 })).rejects.toThrow( + /fuser not found/i, + ); + }); });