Daemon: back off unhealthy gateway restarts

This commit is contained in:
Vincent Koc
2026-03-06 14:46:01 -05:00
parent c026374fb1
commit bcb8475691
5 changed files with 24 additions and 10 deletions

View File

@@ -1,9 +1,9 @@
import fs from "node:fs/promises";
// launchd applies ThrottleInterval to any rapid relaunch, including
// intentional gateway restarts. Keep it low so CLI restarts and forced
// reinstalls do not stall for a full minute.
export const LAUNCH_AGENT_THROTTLE_INTERVAL_SECONDS = 1;
// launchd applies ThrottleInterval to any rapid relaunch, including config-crash
// loops. Intentional gateway restarts use launchctl kickstart, so a higher value
// here primarily slows unhealthy restart storms without making operator restarts sluggish.
export const LAUNCH_AGENT_THROTTLE_INTERVAL_SECONDS = 30;
// launchd stores plist integer values in decimal; 0o077 renders as 63 (owner-only files).
export const LAUNCH_AGENT_UMASK_DECIMAL = 0o077;
@@ -113,5 +113,5 @@ export function buildLaunchAgentPlist({
? `\n <key>Comment</key>\n <string>${plistEscape(comment.trim())}</string>`
: "";
const envXml = renderEnvDict(environment);
return `<?xml version="1.0" encoding="UTF-8"?>\n<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">\n<plist version="1.0">\n <dict>\n <key>Label</key>\n <string>${plistEscape(label)}</string>\n ${commentXml}\n <key>RunAtLoad</key>\n <true/>\n <key>KeepAlive</key>\n <true/>\n <key>ThrottleInterval</key>\n <integer>${LAUNCH_AGENT_THROTTLE_INTERVAL_SECONDS}</integer>\n <key>Umask</key>\n <integer>${LAUNCH_AGENT_UMASK_DECIMAL}</integer>\n <key>ProgramArguments</key>\n <array>${argsXml}\n </array>\n ${workingDirXml}\n <key>StandardOutPath</key>\n <string>${plistEscape(stdoutPath)}</string>\n <key>StandardErrorPath</key>\n <string>${plistEscape(stderrPath)}</string>${envXml}\n </dict>\n</plist>\n`;
return `<?xml version="1.0" encoding="UTF-8"?>\n<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">\n<plist version="1.0">\n <dict>\n <key>Label</key>\n <string>${plistEscape(label)}</string>\n ${commentXml}\n <key>RunAtLoad</key>\n <true/>\n <key>KeepAlive</key>\n <dict>\n <key>SuccessfulExit</key>\n <false/>\n </dict>\n <key>ThrottleInterval</key>\n <integer>${LAUNCH_AGENT_THROTTLE_INTERVAL_SECONDS}</integer>\n <key>Umask</key>\n <integer>${LAUNCH_AGENT_UMASK_DECIMAL}</integer>\n <key>ProgramArguments</key>\n <array>${argsXml}\n </array>\n ${workingDirXml}\n <key>StandardOutPath</key>\n <string>${plistEscape(stdoutPath)}</string>\n <key>StandardErrorPath</key>\n <string>${plistEscape(stderrPath)}</string>${envXml}\n </dict>\n</plist>\n`;
}

View File

@@ -189,7 +189,7 @@ describe("launchd install", () => {
expect(plist).toContain(`<string>${tmpDir}</string>`);
});
it("writes KeepAlive=true policy with restrictive umask", async () => {
it("writes KeepAlive restart-on-failure policy with restrictive umask", async () => {
const env = createDefaultLaunchdEnv();
await installLaunchAgent({
env,
@@ -200,8 +200,9 @@ describe("launchd install", () => {
const plistPath = resolveLaunchAgentPlistPath(env);
const plist = state.files.get(plistPath) ?? "";
expect(plist).toContain("<key>KeepAlive</key>");
expect(plist).toContain("<true/>");
expect(plist).not.toContain("<key>SuccessfulExit</key>");
expect(plist).toContain("<dict>");
expect(plist).toContain("<key>SuccessfulExit</key>");
expect(plist).toContain("<false/>");
expect(plist).toContain("<key>Umask</key>");
expect(plist).toContain(`<integer>${LAUNCH_AGENT_UMASK_DECIMAL}</integer>`);
expect(plist).toContain("<key>ThrottleInterval</key>");

View File

@@ -171,7 +171,10 @@ async function auditLaunchdPlist(
}
const hasRunAtLoad = /<key>RunAtLoad<\/key>\s*<true\s*\/>/i.test(content);
const hasKeepAlive = /<key>KeepAlive<\/key>\s*<true\s*\/>/i.test(content);
const hasKeepAlive =
/<key>KeepAlive<\/key>\s*(?:<true\s*\/>|<dict>[\s\S]*?<key>SuccessfulExit<\/key>\s*<false\s*\/>[\s\S]*?<\/dict>)/i.test(
content,
);
if (!hasRunAtLoad) {
issues.push({
code: SERVICE_AUDIT_CODES.launchdRunAtLoad,

View File

@@ -21,6 +21,16 @@ describe("buildSystemdUnit", () => {
expect(unit).toContain("KillMode=control-group");
});
// Restart-policy check: the rendered unit must contain Restart=on-failure
// and must not contain Restart=always.
it("restarts only on failure", () => {
  const unitOptions = {
    description: "OpenClaw Gateway",
    programArguments: ["/usr/bin/openclaw", "gateway", "run"],
    environment: {},
  };
  const renderedUnit = buildSystemdUnit(unitOptions);

  expect(renderedUnit).toContain("Restart=on-failure");
  expect(renderedUnit).not.toContain("Restart=always");
});
it("rejects environment values with line breaks", () => {
expect(() =>
buildSystemdUnit({

View File

@@ -57,7 +57,7 @@ export function buildSystemdUnit({
"",
"[Service]",
`ExecStart=${execStart}`,
"Restart=always",
"Restart=on-failure",
"RestartSec=5",
// Keep service children in the same lifecycle so restarts do not leave
// orphan ACP/runtime workers behind.