From bcb8475691e2e10e7db335d70e8b31e4c623b693 Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Fri, 6 Mar 2026 14:46:01 -0500 Subject: [PATCH] Daemon: back off unhealthy gateway restarts --- src/daemon/launchd-plist.ts | 10 +++++----- src/daemon/launchd.test.ts | 7 ++++--- src/daemon/service-audit.ts | 5 ++++- src/daemon/systemd-unit.test.ts | 10 ++++++++++ src/daemon/systemd-unit.ts | 2 +- 5 files changed, 24 insertions(+), 10 deletions(-) diff --git a/src/daemon/launchd-plist.ts b/src/daemon/launchd-plist.ts index fa2a780a5c8..a9b489af438 100644 --- a/src/daemon/launchd-plist.ts +++ b/src/daemon/launchd-plist.ts @@ -1,9 +1,9 @@ import fs from "node:fs/promises"; -// launchd applies ThrottleInterval to any rapid relaunch, including -// intentional gateway restarts. Keep it low so CLI restarts and forced -// reinstalls do not stall for a full minute. -export const LAUNCH_AGENT_THROTTLE_INTERVAL_SECONDS = 1; +// launchd applies ThrottleInterval to any rapid relaunch, including config-crash +// loops. Intentional gateway restarts use launchctl kickstart, so a higher value +// here primarily slows unhealthy restart storms without making operator restarts sluggish. +export const LAUNCH_AGENT_THROTTLE_INTERVAL_SECONDS = 30; // launchd stores plist integer values in decimal; 0o077 renders as 63 (owner-only files). export const LAUNCH_AGENT_UMASK_DECIMAL = 0o077; @@ -113,5 +113,5 @@ export function buildLaunchAgentPlist({ ? 
`\n <key>Comment</key>\n <string>${plistEscape(comment.trim())}</string>` : ""; const envXml = renderEnvDict(environment); - return `<?xml version="1.0" encoding="UTF-8"?>\n<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">\n<plist version="1.0">\n <dict>\n <key>Label</key>\n <string>${plistEscape(label)}</string>\n ${commentXml}\n <key>RunAtLoad</key>\n <true/>\n <key>KeepAlive</key>\n <true/>\n <key>ThrottleInterval</key>\n <integer>${LAUNCH_AGENT_THROTTLE_INTERVAL_SECONDS}</integer>\n <key>Umask</key>\n <integer>${LAUNCH_AGENT_UMASK_DECIMAL}</integer>\n <key>ProgramArguments</key>\n <array>${argsXml}\n </array>\n ${workingDirXml}\n <key>StandardOutPath</key>\n <string>${plistEscape(stdoutPath)}</string>\n <key>StandardErrorPath</key>\n <string>${plistEscape(stderrPath)}</string>${envXml}\n </dict>\n</plist>\n`; + return `<?xml version="1.0" encoding="UTF-8"?>\n<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">\n<plist version="1.0">\n <dict>\n <key>Label</key>\n <string>${plistEscape(label)}</string>\n ${commentXml}\n <key>RunAtLoad</key>\n <true/>\n <key>KeepAlive</key>\n <dict>\n <key>SuccessfulExit</key>\n <false/>\n </dict>\n <key>ThrottleInterval</key>\n <integer>${LAUNCH_AGENT_THROTTLE_INTERVAL_SECONDS}</integer>\n <key>Umask</key>\n <integer>${LAUNCH_AGENT_UMASK_DECIMAL}</integer>\n <key>ProgramArguments</key>\n <array>${argsXml}\n </array>\n ${workingDirXml}\n <key>StandardOutPath</key>\n <string>${plistEscape(stdoutPath)}</string>\n <key>StandardErrorPath</key>\n <string>${plistEscape(stderrPath)}</string>${envXml}\n </dict>\n</plist>\n`; } diff --git a/src/daemon/launchd.test.ts b/src/daemon/launchd.test.ts index ca94f8b5602..f1b7574eb46 100644 --- a/src/daemon/launchd.test.ts +++ b/src/daemon/launchd.test.ts @@ -189,7 +189,7 @@ describe("launchd install", () => { expect(plist).toContain(`<string>${tmpDir}</string>`); }); - it("writes KeepAlive=true policy with restrictive umask", async () => { + it("writes KeepAlive restart-on-failure policy with restrictive umask", async () => { const env = createDefaultLaunchdEnv(); await installLaunchAgent({ env, @@ -200,8 +200,9 @@ describe("launchd install", () => { const plistPath = resolveLaunchAgentPlistPath(env); const plist = state.files.get(plistPath) ?? 
""; expect(plist).toContain("<key>KeepAlive</key>"); - expect(plist).toContain("<true/>"); - expect(plist).not.toContain("<key>SuccessfulExit</key>"); + expect(plist).toContain("<dict>"); + expect(plist).toContain("<key>SuccessfulExit</key>"); + expect(plist).toContain("<false/>"); expect(plist).toContain("<key>Umask</key>"); expect(plist).toContain(`<integer>${LAUNCH_AGENT_UMASK_DECIMAL}</integer>`); expect(plist).toContain("<key>ThrottleInterval</key>"); diff --git a/src/daemon/service-audit.ts b/src/daemon/service-audit.ts index 09e766065ec..69698b5a529 100644 --- a/src/daemon/service-audit.ts +++ b/src/daemon/service-audit.ts @@ -171,7 +171,10 @@ async function auditLaunchdPlist( } const hasRunAtLoad = /<key>RunAtLoad<\/key>\s*<true\s*\/>/i.test(content); - const hasKeepAlive = /<key>KeepAlive<\/key>\s*<true\s*\/>/i.test(content); + const hasKeepAlive = + /<key>KeepAlive<\/key>\s*(?:<true\s*\/>|<dict>[\s\S]*?<key>SuccessfulExit<\/key>\s*<false\s*\/>[\s\S]*?<\/dict>)/i.test( + content, + ); if (!hasRunAtLoad) { issues.push({ code: SERVICE_AUDIT_CODES.launchdRunAtLoad, diff --git a/src/daemon/systemd-unit.test.ts b/src/daemon/systemd-unit.test.ts index 5c5562b25e6..aa32f30f2bf 100644 --- a/src/daemon/systemd-unit.test.ts +++ b/src/daemon/systemd-unit.test.ts @@ -21,6 +21,16 @@ describe("buildSystemdUnit", () => { expect(unit).toContain("KillMode=control-group"); }); + it("restarts only on failure", () => { + const unit = buildSystemdUnit({ + description: "OpenClaw Gateway", + programArguments: ["/usr/bin/openclaw", "gateway", "run"], + environment: {}, + }); + expect(unit).toContain("Restart=on-failure"); + expect(unit).not.toContain("Restart=always"); + }); + it("rejects environment values with line breaks", () => { expect(() => buildSystemdUnit({ diff --git a/src/daemon/systemd-unit.ts b/src/daemon/systemd-unit.ts index 9cddbee24d1..4e9d0d72a5f 100644 --- a/src/daemon/systemd-unit.ts +++ b/src/daemon/systemd-unit.ts @@ -57,7 +57,7 @@ export function buildSystemdUnit({ "", "[Service]", `ExecStart=${execStart}`, - "Restart=always", + "Restart=on-failure", "RestartSec=5", // Keep service children in the same lifecycle so 
restarts do not leave // orphan ACP/runtime workers behind.