fix: log detached service restart attempts

This commit is contained in:
Peter Steinberger
2026-04-18 19:06:57 +01:00
parent 28be124cc1
commit 438799e929
6 changed files with 132 additions and 48 deletions

View File

@@ -75,7 +75,7 @@ exit 0
const pollAttemptIncrement = "set /a attempts+=1";
const pollNetstatCheck = `netstat -ano | findstr /R /C:":${port} .*LISTENING" >nul`;
const forceKillLabel = ":force_kill_listener";
const forceKillCommand = "taskkill /F /PID %%P >nul 2>&1";
const forceKillCommand = "taskkill /F /PID %%P >>";
const portReleasedLabel = ":port_released";
const runCommand = 'schtasks /Run /TN "';
const endIndex = content.indexOf(endCommand);
@@ -151,7 +151,7 @@ exit 0
await cleanupScript(scriptPath);
});
it("captures macOS launchctl stderr to ~/.openclaw/logs/update-restart.log (#68486)", async () => {
it("captures macOS launchctl stderr to ~/.openclaw/logs/gateway-restart.log (#68486)", async () => {
// Silent failure in macOS update restart helper: previously every
// launchctl call redirected stderr to /dev/null and the final kickstart
// was chained with `|| true`, so bootstrap/kickstart failures were
@@ -166,7 +166,7 @@ exit 0
HOME: "/Users/testuser",
});
expect(content).toContain(
"exec >>'/Users/testuser/.openclaw/logs/update-restart.log' 2>&1 || true",
"exec >>'/Users/testuser/.openclaw/logs/gateway-restart.log' 2>&1 || true",
);
// Every launchctl call should allow output through now (no `2>/dev/null`)
// and the final kickstart must not swallow its exit code.
@@ -187,7 +187,7 @@ exit 0
expect(content).toContain("mkdir -p '/tmp/openclaw-state/logs' 2>/dev/null || true");
expect(content).toContain(
"exec >>'/tmp/openclaw-state/logs/update-restart.log' 2>&1 || true",
"exec >>'/tmp/openclaw-state/logs/gateway-restart.log' 2>&1 || true",
);
await cleanupScript(scriptPath);
});
@@ -220,13 +220,13 @@ exit 0
const result = await executeScript(scriptPath, {
PATH: `${fakeBinDir}:${process.env.PATH ?? ""}`,
});
const log = await fs.readFile(path.join(stateDir, "logs", "update-restart.log"), "utf-8");
const log = await fs.readFile(path.join(stateDir, "logs", "gateway-restart.log"), "utf-8");
expect(result.code).toBe(42);
expect(log).toContain("openclaw update restart attempt (label=ai.openclaw.gateway)");
expect(log).toContain("openclaw restart attempt source=update target=ai.openclaw.gateway");
expect(log).toContain("launchctl kickstart -k gui/501/ai.openclaw.gateway");
expect(log).toContain("openclaw update restart failed status=42");
expect(log).not.toContain("openclaw update restart done");
expect(log).toContain("openclaw restart failed source=update status=42");
expect(log).not.toContain("openclaw restart done source=update");
});
it("continues the macOS restart path when log setup fails", async () => {
@@ -279,11 +279,11 @@ exit 0
const result = await executeScript(scriptPath, {
PATH: `${fakeBinDir}:${process.env.PATH ?? ""}`,
});
const log = await fs.readFile(path.join(stateDir, "logs", "update-restart.log"), "utf-8");
const log = await fs.readFile(path.join(stateDir, "logs", "gateway-restart.log"), "utf-8");
expect(result.code).toBeNull();
expect(log).toContain("label=ai.openclaw.$(echo injected)");
expect(log).not.toContain("label=ai.openclaw.injected");
expect(log).toContain("target=ai.openclaw.$(echo injected)");
expect(log).not.toContain("target=ai.openclaw.injected");
});
it("uses OPENCLAW_LAUNCHD_LABEL override on macOS", async () => {
@@ -306,8 +306,10 @@ exit 0
});
expect(scriptPath.endsWith(".bat")).toBe(true);
expect(content).toContain("@echo off");
expect(content).toContain("gateway-restart.log");
expect(content).toContain("openclaw restart attempt source=update target=OpenClaw Gateway");
expect(content).toContain('schtasks /End /TN "OpenClaw Gateway"');
expect(content).toContain('schtasks /Run /TN "OpenClaw Gateway"');
expect(content).toContain('schtasks /Run /TN "OpenClaw Gateway" >>');
expectWindowsRestartWaitOrdering(content);
// Batch self-cleanup
expect(content).toContain('del "%~f0"');

View File

@@ -2,13 +2,18 @@ import { spawn } from "node:child_process";
import fs from "node:fs/promises";
import os from "node:os";
import path from "node:path";
import { DEFAULT_GATEWAY_PORT, resolveStateDir } from "../../config/paths.js";
import { DEFAULT_GATEWAY_PORT } from "../../config/paths.js";
import { quoteCmdScriptArg } from "../../daemon/cmd-argv.js";
import {
resolveGatewayLaunchAgentLabel,
resolveGatewaySystemdServiceName,
resolveGatewayWindowsTaskName,
} from "../../daemon/constants.js";
import {
renderCmdRestartLogSetup,
renderPosixRestartLogSetup,
shellEscapeRestartLogValue,
} from "../../daemon/restart-logs.js";
import { normalizeOptionalString } from "../../shared/string-coerce.js";
/**
@@ -71,14 +76,24 @@ export async function prepareRestartScript(
if (platform === "linux") {
const unitName = resolveSystemdUnit(env);
const escaped = shellEscape(unitName);
const logSetup = renderPosixRestartLogSetup({ ...process.env, ...env });
filename = `openclaw-restart-${timestamp}.sh`;
scriptContent = `#!/bin/sh
# Standalone restart script — survives parent process termination.
# Wait briefly to ensure file locks are released after update.
sleep 1
systemctl --user restart '${escaped}'
${logSetup}
printf '[%s] openclaw restart attempt source=update target=%s\\n' "$(date -u +%FT%TZ)" '${escaped}' >&2
if systemctl --user restart '${escaped}'; then
status=0
printf '[%s] openclaw restart done source=update\\n' "$(date -u +%FT%TZ)" >&2
else
status=$?
printf '[%s] openclaw restart failed source=update status=%s\\n' "$(date -u +%FT%TZ)" "$status" >&2
fi
# Self-cleanup
rm -f "$0"
exit "$status"
`;
} else if (platform === "darwin") {
const label = resolveLaunchdLabel(env);
@@ -90,10 +105,7 @@ rm -f "$0"
const home = normalizeOptionalString(env.HOME) || process.env.HOME || os.homedir();
const plistPath = path.join(home, "Library", "LaunchAgents", `${label}.plist`);
const escapedPlistPath = shellEscape(plistPath);
const logDir = path.join(resolveStateDir(env), "logs");
const logPath = path.join(logDir, "update-restart.log");
const escapedLogDir = shellEscape(logDir);
const escapedLogPath = shellEscape(logPath);
const logSetup = renderPosixRestartLogSetup({ ...process.env, ...env });
filename = `openclaw-restart-${timestamp}.sh`;
scriptContent = `#!/bin/sh
# Standalone restart script — survives parent process termination.
@@ -102,9 +114,8 @@ sleep 1
# Capture launchctl output so bootstrap/kickstart failures leave a durable
# audit trail. Log setup is best-effort: restart must still run if the log path
# is temporarily unavailable.
mkdir -p '${escapedLogDir}' 2>/dev/null || true
exec >>'${escapedLogPath}' 2>&1 || true
printf '[%s] openclaw update restart attempt (label=%s)\\n' "$(date -u +%FT%TZ)" '${escaped}' >&2
${logSetup}
printf '[%s] openclaw restart attempt source=update target=%s\\n' "$(date -u +%FT%TZ)" '${shellEscapeRestartLogValue(label)}' >&2
# Try kickstart first (works when the service is still registered).
# If it fails (e.g. after bootout), clear any persisted disabled state,
# then re-register via bootstrap and kickstart. The final status is captured
@@ -117,9 +128,9 @@ if ! launchctl kickstart -k 'gui/${uid}/${escaped}'; then
status=$?
fi
if [ "$status" -eq 0 ]; then
printf '[%s] openclaw update restart done\\n' "$(date -u +%FT%TZ)" >&2
printf '[%s] openclaw restart done source=update\\n' "$(date -u +%FT%TZ)" >&2
else
printf '[%s] openclaw update restart failed status=%s\\n' "$(date -u +%FT%TZ)" "$status" >&2
printf '[%s] openclaw restart failed source=update status=%s\\n' "$(date -u +%FT%TZ)" "$status" >&2
fi
# Self-cleanup (log is retained under the OpenClaw state logs directory).
rm -f "$0"
@@ -132,12 +143,15 @@ exit "$status"
}
const port =
Number.isFinite(gatewayPort) && gatewayPort > 0 ? gatewayPort : DEFAULT_GATEWAY_PORT;
const restartLog = renderCmdRestartLogSetup({ ...process.env, ...env });
filename = `openclaw-restart-${timestamp}.bat`;
scriptContent = `@echo off
REM Standalone restart script — survives parent process termination.
REM Wait briefly to ensure file locks are released after update.
timeout /t 2 /nobreak >nul
schtasks /End /TN "${taskName}"
${restartLog.lines.join("\r\n")}
>> ${restartLog.quotedLogPath} 2>&1 echo [%DATE% %TIME%] openclaw restart attempt source=update target=${taskName}
schtasks /End /TN "${taskName}" >> ${restartLog.quotedLogPath} 2>&1
REM Poll for gateway port release before rerun; force-kill listener if stuck.
set /a attempts=0
:wait_for_port_release
@@ -149,13 +163,20 @@ timeout /t 1 /nobreak >nul
goto wait_for_port_release
:force_kill_listener
for /f "tokens=5" %%P in ('netstat -ano ^| findstr /R /C:":${port} .*LISTENING"') do (
taskkill /F /PID %%P >nul 2>&1
taskkill /F /PID %%P >> ${restartLog.quotedLogPath} 2>&1
goto port_released
)
:port_released
schtasks /Run /TN "${taskName}"
schtasks /Run /TN "${taskName}" >> ${restartLog.quotedLogPath} 2>&1
set "status=%ERRORLEVEL%"
if not "%status%"=="0" (
>> ${restartLog.quotedLogPath} 2>&1 echo [%DATE% %TIME%] openclaw restart failed source=update status=%status%
) else (
>> ${restartLog.quotedLogPath} 2>&1 echo [%DATE% %TIME%] openclaw restart done source=update
)
REM Self-cleanup
del "%~f0"
exit /b %status%
`;
} else {
return null;

View File

@@ -41,10 +41,13 @@ describe("scheduleDetachedLaunchdRestartHandoff", () => {
expect(args[6]).toBe("9876");
expect(args[7]).toBe("ai.openclaw.gateway");
expect(args[1]).toContain('while kill -0 "$wait_pid" >/dev/null 2>&1; do');
expect(args[1]).toContain('launchctl enable "$service_target" >/dev/null 2>&1');
expect(args[1]).toContain(
'if ! launchctl kickstart -k "$service_target" >/dev/null 2>&1; then',
"exec >>'/Users/test/.openclaw/logs/gateway-restart.log' 2>&1 || true",
);
expect(args[1]).toContain("openclaw restart attempt source=launchd-handoff mode=kickstart");
expect(args[1]).toContain('launchctl enable "$service_target"');
expect(args[1]).toContain('if launchctl kickstart -k "$service_target"; then');
expect(args[1]).not.toMatch(/launchctl[^\n]*\/dev\/null/);
expect(args[1]).not.toContain("sleep 1");
expect(unrefMock).toHaveBeenCalledTimes(1);
});
@@ -62,7 +65,7 @@ describe("scheduleDetachedLaunchdRestartHandoff", () => {
const [, args] = spawnMock.mock.calls[0] as [string, string[]];
expect(args[7]).toBe("ai.openclaw.gateway");
expect(args[1]).toContain('launchctl start "$label" >/dev/null 2>&1');
expect(args[1]).toContain('if launchctl start "$label"; then');
expect(args[1]).not.toContain('basename "$service_target"');
});

View File

@@ -5,6 +5,7 @@ import { formatErrorMessage } from "../infra/errors.js";
import { normalizeOptionalString } from "../shared/string-coerce.js";
import { sanitizeForLog } from "../terminal/ansi.js";
import { resolveGatewayLaunchAgentLabel } from "./constants.js";
import { renderPosixRestartLogSetup } from "./restart-logs.js";
export type LaunchdRestartHandoffMode = "kickstart" | "start-after-exit";
@@ -74,9 +75,14 @@ export function isCurrentProcessLaunchdServiceLabel(
return Boolean(configuredLabel && configuredLabel === label);
}
function buildLaunchdRestartScript(mode: LaunchdRestartHandoffMode): string {
function buildLaunchdRestartScript(
mode: LaunchdRestartHandoffMode,
env: Record<string, string | undefined>,
): string {
const waitForCallerPid = `wait_pid="$4"
label="$5"
${renderPosixRestartLogSetup(env)}
printf '[%s] openclaw restart attempt source=launchd-handoff mode=${mode} target=%s waitPid=%s\\n' "$(date -u +%FT%TZ)" "$service_target" "$wait_pid" >&2
if [ -n "$wait_pid" ] && [ "$wait_pid" -gt 1 ] 2>/dev/null; then
while kill -0 "$wait_pid" >/dev/null 2>&1; do
sleep 0.1
@@ -90,12 +96,23 @@ fi
domain="$2"
plist_path="$3"
${waitForCallerPid}
launchctl enable "$service_target" >/dev/null 2>&1
if ! launchctl kickstart -k "$service_target" >/dev/null 2>&1; then
if launchctl bootstrap "$domain" "$plist_path" >/dev/null 2>&1; then
launchctl kickstart -k "$service_target" >/dev/null 2>&1 || true
status=0
launchctl enable "$service_target"
if launchctl kickstart -k "$service_target"; then
status=0
else
status=$?
if launchctl bootstrap "$domain" "$plist_path"; then
launchctl kickstart -k "$service_target"
status=$?
fi
fi
if [ "$status" -eq 0 ]; then
printf '[%s] openclaw restart done source=launchd-handoff mode=${mode}\\n' "$(date -u +%FT%TZ)" >&2
else
printf '[%s] openclaw restart failed source=launchd-handoff mode=${mode} status=%s\\n' "$(date -u +%FT%TZ)" "$status" >&2
fi
exit "$status"
`;
}
@@ -104,14 +121,30 @@ fi
domain="$2"
plist_path="$3"
${waitForCallerPid}
launchctl enable "$service_target" >/dev/null 2>&1
if ! launchctl start "$label" >/dev/null 2>&1; then
if launchctl bootstrap "$domain" "$plist_path" >/dev/null 2>&1; then
launchctl start "$label" >/dev/null 2>&1 || launchctl kickstart -k "$service_target" >/dev/null 2>&1 || true
status=0
launchctl enable "$service_target"
if launchctl start "$label"; then
status=0
else
status=$?
if launchctl bootstrap "$domain" "$plist_path"; then
if launchctl start "$label"; then
status=0
else
launchctl kickstart -k "$service_target"
status=$?
fi
else
launchctl kickstart -k "$service_target" >/dev/null 2>&1 || true
launchctl kickstart -k "$service_target"
status=$?
fi
fi
if [ "$status" -eq 0 ]; then
printf '[%s] openclaw restart done source=launchd-handoff mode=${mode}\\n' "$(date -u +%FT%TZ)" >&2
else
printf '[%s] openclaw restart failed source=launchd-handoff mode=${mode} status=%s\\n' "$(date -u +%FT%TZ)" "$status" >&2
fi
exit "$status"
`;
}
@@ -125,12 +158,13 @@ export function scheduleDetachedLaunchdRestartHandoff(params: {
typeof params.waitForPid === "number" && Number.isFinite(params.waitForPid)
? Math.floor(params.waitForPid)
: 0;
const restartEnv = { ...process.env, ...params.env };
try {
const child = spawn(
"/bin/sh",
[
"-c",
buildLaunchdRestartScript(params.mode),
buildLaunchdRestartScript(params.mode, restartEnv),
"openclaw-launchd-restart-handoff",
target.serviceTarget,
target.domain,
@@ -141,7 +175,7 @@ export function scheduleDetachedLaunchdRestartHandoff(params: {
{
detached: true,
stdio: "ignore",
env: { ...process.env, ...params.env },
env: restartEnv,
},
);
child.unref();

View File

@@ -113,7 +113,11 @@ describe("relaunchGatewayScheduledTask", () => {
expect(scriptPath).toBeTruthy();
const script = fs.readFileSync(scriptPath, "utf8");
expect(script).toContain("timeout /t 1 /nobreak >nul");
expect(script).toContain('schtasks /Run /TN "OpenClaw Gateway (work)" >nul 2>&1');
expect(script).toContain("gateway-restart.log");
expect(script).toContain(
'openclaw restart attempt source=windows-task-handoff target="OpenClaw Gateway (work)"',
);
expect(script).toContain('schtasks /Run /TN "OpenClaw Gateway (work)" >>');
expect(script).toContain('del "%~f0" >nul 2>&1');
});
@@ -130,7 +134,7 @@ describe("relaunchGatewayScheduledTask", () => {
const scriptPath = [...createdScriptPaths][0];
const script = fs.readFileSync(scriptPath, "utf8");
expect(script).toContain('schtasks /Run /TN "OpenClaw Gateway (custom)" >nul 2>&1');
expect(script).toContain('schtasks /Run /TN "OpenClaw Gateway (custom)" >>');
});
it("returns failed when the helper cannot be spawned", () => {

View File

@@ -4,6 +4,7 @@ import fs from "node:fs";
import path from "node:path";
import { quoteCmdScriptArg } from "../daemon/cmd-argv.js";
import { resolveGatewayWindowsTaskName } from "../daemon/constants.js";
import { renderCmdRestartLogSetup } from "../daemon/restart-logs.js";
import { resolveTaskScriptPath } from "../daemon/schtasks.js";
import { formatErrorMessage } from "./errors.js";
import type { RestartAttempt } from "./restart.types.js";
@@ -20,28 +21,41 @@ function resolveWindowsTaskName(env: NodeJS.ProcessEnv): string {
return resolveGatewayWindowsTaskName(env.OPENCLAW_PROFILE);
}
function buildScheduledTaskRestartScript(taskName: string, taskScriptPath?: string): string {
function buildScheduledTaskRestartScript(params: {
quotedLogPath: string;
setupLines: string[];
taskName: string;
taskScriptPath?: string;
}): string {
const { quotedLogPath, setupLines, taskName, taskScriptPath } = params;
const quotedTaskName = quoteCmdScriptArg(taskName);
const lines = [
"@echo off",
"setlocal",
`schtasks /Query /TN ${quotedTaskName} >nul 2>&1`,
...setupLines,
`>> ${quotedLogPath} 2>&1 echo [%DATE% %TIME%] openclaw restart attempt source=windows-task-handoff target=${quotedTaskName}`,
`schtasks /Query /TN ${quotedTaskName} >> ${quotedLogPath} 2>&1`,
"if errorlevel 1 goto fallback",
"set /a attempts=0",
":retry",
`timeout /t ${TASK_RESTART_RETRY_DELAY_SEC} /nobreak >nul`,
"set /a attempts+=1",
`schtasks /Run /TN ${quotedTaskName} >nul 2>&1`,
`schtasks /Run /TN ${quotedTaskName} >> ${quotedLogPath} 2>&1`,
"if not errorlevel 1 goto cleanup",
`if %attempts% GEQ ${TASK_RESTART_RETRY_LIMIT} goto fallback`,
"goto retry",
":fallback",
`>> ${quotedLogPath} 2>&1 echo [%DATE% %TIME%] openclaw restart fallback source=windows-task-handoff`,
];
if (taskScriptPath) {
const quotedScript = quoteCmdScriptArg(taskScriptPath);
lines.push(`if exist ${quotedScript} (`, ` start "" /min cmd.exe /d /c ${quotedScript}`, ")");
}
lines.push(":cleanup", 'del "%~f0" >nul 2>&1');
lines.push(
":cleanup",
`>> ${quotedLogPath} 2>&1 echo [%DATE% %TIME%] openclaw restart finished source=windows-task-handoff`,
'del "%~f0" >nul 2>&1',
);
return lines.join("\r\n");
}
@@ -53,10 +67,16 @@ export function relaunchGatewayScheduledTask(env: NodeJS.ProcessEnv = process.en
`openclaw-schtasks-restart-${randomUUID()}.cmd`,
);
const quotedScriptPath = quoteCmdScriptArg(scriptPath);
const restartLog = renderCmdRestartLogSetup({ ...process.env, ...env });
try {
fs.writeFileSync(
scriptPath,
`${buildScheduledTaskRestartScript(taskName, taskScriptPath)}\r\n`,
`${buildScheduledTaskRestartScript({
quotedLogPath: restartLog.quotedLogPath,
setupLines: restartLog.lines,
taskName,
taskScriptPath,
})}\r\n`,
"utf8",
);
const child = spawn("cmd.exe", ["/d", "/s", "/c", quotedScriptPath], {