From 833654586e5824b0895aa258db45ba2265e419c1 Mon Sep 17 00:00:00 2001 From: Peter Steinberger Date: Tue, 28 Apr 2026 06:30:04 +0100 Subject: [PATCH] fix(gateway): keep container restarts in-process --- CHANGELOG.md | 1 + src/gateway/net.ts | 63 ++++-------------------------- src/infra/container-environment.ts | 53 +++++++++++++++++++++++++ src/infra/process-respawn.test.ts | 21 ++++++++++ src/infra/process-respawn.ts | 7 ++++ 5 files changed, 90 insertions(+), 55 deletions(-) create mode 100644 src/infra/container-environment.ts diff --git a/CHANGELOG.md b/CHANGELOG.md index b42bde1a257..0b61988375f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -60,6 +60,7 @@ Docs: https://docs.openclaw.ai - Channels/Telegram: keep Bot API network fallbacks sticky after failed attempts and retry timed-out startup control calls once on the fallback route, so `deleteWebhook` IPv6 stalls no longer trigger slow multi-account retry storms. Fixes #73255. Thanks @ttomiczek and @sktbrd. - Gateway/models: merge explicit `models.providers.*.models` rows into the Gateway model catalog with normalized provider/model dedupe, and use normalized image-capability lookup so custom vision models keep native image attachments even when Pi discovery omits them or model ID casing differs. Fixes #64213 and #65165. Thanks @billonese and @202233a. - Gateway/reload: publish canonical post-write source config to in-process reloaders so simple config saves no longer create phantom plugin diffs or trigger unnecessary Gateway restarts. (#73267) Thanks @szsip239. +- Gateway/Docker: keep config-triggered restarts in-process inside containers instead of spawning a detached child and exiting PID 1 cleanly, so Docker Swarm and other on-failure supervisors do not leave the service stuck at 0/1 replicas. Fixes #73178. Thanks @du-nguyen-IT007. - CLI/tasks: ship the task-registry control runtime in npm packages so `openclaw tasks cancel` can load ACP/subagent cancellation helpers from published builds. Fixes #68997. Thanks @1OAKDesign. - Channels/Telegram: preserve unsent generated media after partial reply streaming has already delivered the text, so `image_generate` outputs still reach Telegram as photos instead of being dropped from the final payload. Fixes #73253. Thanks @mlaihk. - Export/session: keep inline export HTML scripts and vendor libraries injected after template formatting so generated session exports open with the app code, markdown renderer, and syntax highlighter present. Fixes #41862 and #49957; carries forward #41861 and #68947. Thanks @briannewman, @martenzi, and @armanddp. diff --git a/src/gateway/net.ts b/src/gateway/net.ts index ad0f120cd69..3c23227597d 100644 --- a/src/gateway/net.ts +++ b/src/gateway/net.ts @@ -1,7 +1,10 @@ -import fs from "node:fs"; import type { IncomingMessage } from "node:http"; import net from "node:net"; import type { GatewayBindMode } from "../config/types.gateway.js"; +import { + __resetContainerEnvironmentCacheForTest, + isContainerEnvironment, +} from "../infra/container-environment.js"; import { pickMatchingExternalInterfaceAddress, readNetworkInterfaces, @@ -228,60 +231,10 @@ export function isLocalGatewayAddress(ip: string | undefined): boolean { return false; } -/** - * Detect whether the current process is running inside a container - * (Docker, Podman, or Kubernetes). - * - * Uses two reliable heuristics: - * 1. Presence of well-known container sentinel files such as `/.dockerenv` - * (Docker) or `/run/.containerenv` (Podman). - * 2. Presence of container-related cgroup entries in `/proc/1/cgroup` - * (covers Docker, containerd, and Kubernetes pods). - * - * The result is cached after the first call so filesystem access - * happens at most once per process lifetime. - */ -let _containerCacheResult: boolean | undefined; -export function isContainerEnvironment(): boolean { - if (_containerCacheResult !== undefined) { - return _containerCacheResult; - } - _containerCacheResult = detectContainerEnvironment(); - return _containerCacheResult; -} - -function detectContainerEnvironment(): boolean { - // 1. Check common Docker/Podman container sentinel files. - for (const sentinelPath of ["/.dockerenv", "/run/.containerenv", "/var/run/.containerenv"]) { - try { - fs.accessSync(sentinelPath, fs.constants.F_OK); - return true; - } catch { - // not present — continue - } - } - // 2. /proc/1/cgroup contains docker, containerd, kubepods, or lxc markers. - // Covers both cgroup v1 (/docker/, /kubepods/...) and cgroup v2 - // (kubepods.slice, cri-containerd-.scope) path formats. - try { - const cgroup = fs.readFileSync("/proc/1/cgroup", "utf8"); - if ( - /\/docker\/|cri-containerd-[0-9a-f]|containerd\/[0-9a-f]{64}|\/kubepods[/.]|\blxc\b/.test( - cgroup, - ) - ) { - return true; - } - } catch { - // /proc may not exist (macOS, Windows) — not a container - } - return false; -} - -/** @internal — test-only helper to reset the cached container detection result. */ -export function __resetContainerCacheForTest(): void { - _containerCacheResult = undefined; -} +export { + isContainerEnvironment, + __resetContainerEnvironmentCacheForTest as __resetContainerCacheForTest, +}; /** * Resolves gateway bind host with fallback strategy. diff --git a/src/infra/container-environment.ts b/src/infra/container-environment.ts new file mode 100644 index 00000000000..f209226993f --- /dev/null +++ b/src/infra/container-environment.ts @@ -0,0 +1,53 @@ +import fs from "node:fs"; + +/** + * Detect whether the current process is running inside a container + * (Docker, Podman, or Kubernetes). + * + * Uses two reliable heuristics: + * - Presence of common container sentinel files. + * - Container-related entries in /proc/1/cgroup. + * + * The result is cached after the first call so filesystem access happens at + * most once per process lifetime. + */ +let containerEnvironmentCache: boolean | undefined; + +export function isContainerEnvironment(): boolean { + if (containerEnvironmentCache !== undefined) { + return containerEnvironmentCache; + } + containerEnvironmentCache = detectContainerEnvironment(); + return containerEnvironmentCache; +} + +function detectContainerEnvironment(): boolean { + for (const sentinelPath of ["/.dockerenv", "/run/.containerenv", "/var/run/.containerenv"]) { + try { + fs.accessSync(sentinelPath, fs.constants.F_OK); + return true; + } catch { + // Not present; try the next signal. + } + } + + try { + const cgroup = fs.readFileSync("/proc/1/cgroup", "utf8"); + if ( + /\/docker\/|cri-containerd-[0-9a-f]|containerd\/[0-9a-f]{64}|\/kubepods[/.]|\blxc\b/.test( + cgroup, + ) + ) { + return true; + } + } catch { + // /proc may not exist on non-Linux platforms. + } + + return false; +} + +/** @internal test helper */ +export function __resetContainerEnvironmentCacheForTest(): void { + containerEnvironmentCache = undefined; +} diff --git a/src/infra/process-respawn.test.ts b/src/infra/process-respawn.test.ts index e63e0eb7c6d..89e64d401ab 100644 --- a/src/infra/process-respawn.test.ts +++ b/src/infra/process-respawn.test.ts @@ -4,6 +4,7 @@ import { SUPERVISOR_HINT_ENV_VARS } from "./supervisor-markers.js"; const spawnMock = vi.hoisted(() => vi.fn()); const triggerOpenClawRestartMock = vi.hoisted(() => vi.fn()); +const isContainerEnvironmentMock = vi.hoisted(() => vi.fn(() => false)); vi.mock("node:child_process", async () => { const { mockNodeBuiltinModule } = await import("openclaw/plugin-sdk/test-node-mocks"); @@ -17,6 +18,9 @@ vi.mock("node:child_process", async () => { vi.mock("./restart.js", () => ({ triggerOpenClawRestart: (...args: unknown[]) => triggerOpenClawRestartMock(...args), })); +vi.mock("./container-environment.js", () => ({ + isContainerEnvironment: () => isContainerEnvironmentMock(), +})); import { respawnGatewayProcessForUpdate, @@ -44,6 +48,8 @@ afterEach(() => { process.execArgv = [...originalExecArgv]; spawnMock.mockClear(); triggerOpenClawRestartMock.mockClear(); + isContainerEnvironmentMock.mockReset(); + isContainerEnvironmentMock.mockReturnValue(false); if (originalPlatformDescriptor) { Object.defineProperty(process, "platform", originalPlatformDescriptor); } @@ -206,6 +212,21 @@ describe("restartGatewayProcessWithFreshPid", () => { expect(spawnMock).not.toHaveBeenCalled(); }); + it("returns disabled in containers so PID 1 stays alive for in-process restart", () => { + delete process.env.OPENCLAW_NO_RESPAWN; + clearSupervisorHints(); + setPlatform("linux"); + isContainerEnvironmentMock.mockReturnValue(true); + + const result = restartGatewayProcessWithFreshPid(); + + expect(result).toEqual({ + mode: "disabled", + detail: "container: use in-process restart to keep PID 1 alive", + }); + expect(spawnMock).not.toHaveBeenCalled(); + }); + it("ignores node task script hints for gateway restart detection on Windows", () => { clearSupervisorHints(); setPlatform("win32"); diff --git a/src/infra/process-respawn.ts b/src/infra/process-respawn.ts index 8a70660d3da..9926a4e5bb9 100644 --- a/src/infra/process-respawn.ts +++ b/src/infra/process-respawn.ts @@ -1,5 +1,6 @@ import { spawn, type ChildProcess } from "node:child_process"; import { normalizeOptionalLowercaseString } from "../shared/string-coerce.js"; +import { isContainerEnvironment } from "./container-environment.js"; import { formatErrorMessage } from "./errors.js"; import { triggerOpenClawRestart } from "./restart.js"; import { detectRespawnSupervisor } from "./supervisor-markers.js"; @@ -66,6 +67,12 @@ export function restartGatewayProcessWithFreshPid(): GatewayRespawnResult { detail: "win32: detached respawn unsupported without Scheduled Task markers", }; } + if (isContainerEnvironment()) { + return { + mode: "disabled", + detail: "container: use in-process restart to keep PID 1 alive", + }; + } try { const { pid } = spawnDetachedGatewayProcess();