fix: repair stale gateway service on start

This commit is contained in:
Peter Steinberger
2026-05-02 20:56:12 +01:00
parent 9eb79bcf99
commit 3b1a020eba
10 changed files with 383 additions and 3 deletions

View File

@@ -31,7 +31,7 @@ import {
} from "./shared.js";
import type { DaemonInstallOptions } from "./types.js";
function mergeInstallInvocationEnv(params: {
export function mergeInstallInvocationEnv(params: {
env: NodeJS.ProcessEnv;
existingServiceEnv?: Record<string, string>;
}): NodeJS.ProcessEnv {

View File

@@ -353,6 +353,55 @@ describe("runServiceRestart token drift", () => {
expect(payload.message).toBe("restart scheduled, gateway will restart momentarily");
});
it("repairs stale loaded services during start before reporting success", async () => {
  // Shape of the JSON payload this test inspects.
  type StartPayload = {
    result?: string;
    message?: string;
    warnings?: string[];
    service?: { loaded?: boolean };
  };

  // The installed service definition carries an older OPENCLAW_SERVICE_VERSION.
  service.readCommand.mockResolvedValue({
    programArguments: ["openclaw", "gateway"],
    environment: { OPENCLAW_SERVICE_VERSION: "2026.4.24" },
  });

  // Repair callback that reports a successful repair with one warning.
  const repairOutcome = {
    result: "started" as const,
    message: "Gateway service definition repaired and started.",
    warnings: ["service was installed by OpenClaw 2026.4.24, current CLI is 2026.5.2"],
    loaded: true,
  };
  const repairLoadedService = vi.fn(async () => repairOutcome);

  await runServiceStart({
    serviceNoun: "Gateway",
    service,
    renderStartHints: () => [],
    opts: { json: true },
    repairLoadedService,
  });

  // The repair path must run exactly once and must not go through restart.
  expect(repairLoadedService).toHaveBeenCalledTimes(1);
  expect(service.restart).not.toHaveBeenCalled();

  // The callback's outcome is surfaced verbatim in the JSON payload.
  const { result, message, warnings, service: serviceSnapshot } = readJsonLog<StartPayload>();
  expect(result).toBe("started");
  expect(message).toBe("Gateway service definition repaired and started.");
  expect(warnings?.[0]).toContain("service was installed by OpenClaw");
  expect(serviceSnapshot?.loaded).toBe(true);
});
it("fails start with an install hint when a stale loaded service has no repair callback", async () => {
  type FailurePayload = { ok?: boolean; error?: string; hints?: string[] };

  // The installed service definition carries an older OPENCLAW_SERVICE_VERSION.
  service.readCommand.mockResolvedValue({
    programArguments: ["openclaw", "gateway"],
    environment: { OPENCLAW_SERVICE_VERSION: "2026.4.24" },
  });

  // No repairLoadedService callback is supplied, so start must fail.
  const startAttempt = runServiceStart(createServiceRunArgs());
  await expect(startAttempt).rejects.toThrow("__exit__:1");

  const { ok, error, hints } = readJsonLog<FailurePayload>();
  expect(ok).toBe(false);
  expect(error).toContain("service needs repair");
  expect(hints).toEqual(["openclaw gateway install --force"]);
  expect(service.restart).not.toHaveBeenCalled();
});
it("fails start when restarting a stopped installed service errors", async () => {
service.isLoaded.mockResolvedValue(false);
service.restart.mockRejectedValue(new Error("launchctl kickstart failed: permission denied"));

View File

@@ -5,6 +5,7 @@ import { formatConfigIssueLines } from "../../config/issue-format.js";
import { resolveIsNixMode } from "../../config/paths.js";
import { checkTokenDrift } from "../../daemon/service-audit.js";
import type { GatewayServiceRestartResult } from "../../daemon/service-types.js";
import type { GatewayServiceStartRepairIssue, GatewayServiceState } from "../../daemon/service.js";
import { describeGatewayServiceRestart, startGatewayService } from "../../daemon/service.js";
import type { GatewayService } from "../../daemon/service.js";
import { renderSystemdUnavailableHints } from "../../daemon/systemd-hints.js";
@@ -16,6 +17,7 @@ import {
} from "../../infra/restart.js";
import { isWSL } from "../../infra/wsl.js";
import { defaultRuntime } from "../../runtime.js";
import { formatCliCommand } from "../command-format.js";
import { resolveGatewayTokenForDriftCheck } from "./gateway-token-drift.js";
import {
buildDaemonServiceSnapshot,
@@ -48,6 +50,11 @@ type ServiceRecoveryContext = {
fail: (message: string, hints?: string[]) => void;
};
/**
 * Context handed to a `repairLoadedService` callback: everything in
 * `ServiceRecoveryContext` plus the service state and the list of issues
 * that the start attempt reported as requiring repair.
 */
type ServiceStartRepairContext = ServiceRecoveryContext & {
state: GatewayServiceState;
issues: GatewayServiceStartRepairIssue[];
};
async function maybeAugmentSystemdHints(hints: string[]): Promise<string[]> {
if (process.platform !== "linux") {
return hints;
@@ -221,6 +228,7 @@ export async function runServiceStart(params: {
renderStartHints: () => string[];
opts?: DaemonLifecycleOptions;
onNotLoaded?: (ctx: ServiceRecoveryContext) => Promise<ServiceRecoveryResult | null>;
repairLoadedService?: (ctx: ServiceStartRepairContext) => Promise<ServiceRecoveryResult | null>;
}) {
const json = Boolean(params.opts?.json);
const { stdout, emit, fail } = createDaemonActionContext({ action: "start", json });
@@ -298,6 +306,41 @@ export async function runServiceStart(params: {
});
return;
}
// The start attempt reported that the service definition must be repaired
// before it can start. Try the optional repair callback; otherwise fail
// with a reinstall hint.
if (startResult.outcome === "repair-required") {
try {
// The callback is optional; when it is absent `handled` is undefined.
const handled = await params.repairLoadedService?.({
json,
stdout,
fail,
state: startResult.state,
issues: startResult.issues,
});
if (handled) {
// Surface the callback's own result, message and warnings. When the
// callback does not report a loaded state, the snapshot assumes loaded.
emit({
ok: true,
result: handled.result,
message: handled.message,
warnings: handled.warnings,
service: buildDaemonServiceSnapshot(params.service, handled.loaded ?? true),
});
// NOTE(review): presumably emit() does not print the message itself in
// non-JSON mode, hence the explicit log — confirm.
if (!json && handled.message) {
defaultRuntime.log(handled.message);
}
return;
}
} catch (err) {
// Anything thrown inside the try block (the repair callback, emit, or the
// log call) is reported as a repair failure with the generic start hints.
const hints = params.renderStartHints();
fail(`${params.serviceNoun} repair failed: ${String(err)}`, hints);
return;
}
// Reached when no callback was supplied or it resolved to a falsy value:
// list every issue message and point the user at a forced reinstall.
fail(
`${params.serviceNoun} service needs repair before it can start: ${startResult.issues
.map((issue) => issue.message)
.join("; ")}`,
[formatCliCommand("openclaw gateway install --force")],
);
return;
}
emit({
ok: true,
result: "started",

View File

@@ -52,6 +52,7 @@ const probeGateway = vi.fn<
const isRestartEnabled = vi.fn<(config?: { commands?: unknown }) => boolean>(() => true);
const loadConfig = vi.hoisted(() => vi.fn(() => ({})));
const recoverInstalledLaunchAgent = vi.hoisted(() => vi.fn());
const repairLoadedGatewayServiceForStart = vi.hoisted(() => vi.fn());
vi.mock("../../config/config.js", () => ({
getRuntimeConfig: () => loadConfig(),
@@ -89,6 +90,10 @@ vi.mock("./launchd-recovery.js", () => ({
recoverInstalledLaunchAgent(args),
}));
// Replace the start-repair module with a stub that forwards its argument to
// the hoisted `repairLoadedGatewayServiceForStart` mock, so tests can control
// its result and inspect what it was called with.
vi.mock("./start-repair.js", () => ({
repairLoadedGatewayServiceForStart: (args: unknown) => repairLoadedGatewayServiceForStart(args),
}));
vi.mock("./restart-health.js", () => ({
DEFAULT_RESTART_HEALTH_ATTEMPTS: 120,
DEFAULT_RESTART_HEALTH_DELAY_MS: 500,
@@ -160,6 +165,7 @@ describe("runDaemonRestart health checks", () => {
isRestartEnabled.mockReset();
loadConfig.mockReset();
recoverInstalledLaunchAgent.mockReset();
repairLoadedGatewayServiceForStart.mockReset();
service.readCommand.mockResolvedValue({
programArguments: ["openclaw", "gateway", "--port", "18789"],
@@ -224,6 +230,46 @@ describe("runDaemonRestart health checks", () => {
expect(recoverInstalledLaunchAgent).toHaveBeenCalledWith({ result: "started" });
});
it("repairs stale loaded service definitions from gateway start", async () => {
  // Minimal view of what the mocked runServiceStart receives and passes on.
  type RepairArgs = {
    json: boolean;
    stdout: NodeJS.WritableStream;
    state: unknown;
    issues: unknown[];
  };
  type StartParams = {
    repairLoadedService?: (args: RepairArgs) => Promise<unknown>;
  };

  repairLoadedGatewayServiceForStart.mockResolvedValue({
    result: "started",
    message: "Gateway service definition repaired and started.",
    loaded: true,
  });

  // Stand in for runServiceStart: invoke the repair callback right away with
  // a state and issue list describing an older service version.
  runServiceStart.mockImplementation(async (params: StartParams) => {
    const staleState = {
      command: { environment: { OPENCLAW_SERVICE_VERSION: "2026.4.24" } },
    };
    const staleIssues = [{ code: "version-mismatch", message: "old service" }];
    await params.repairLoadedService?.({
      json: true,
      stdout: process.stdout,
      state: staleState,
      issues: staleIssues,
    });
  });

  await runDaemonStart({ json: true });

  // runDaemonStart must forward the callback context, plus the resolved
  // service, to the repair helper.
  const expectedState = expect.objectContaining({
    command: expect.objectContaining({
      environment: { OPENCLAW_SERVICE_VERSION: "2026.4.24" },
    }),
  });
  const expectedIssues = [expect.objectContaining({ code: "version-mismatch" })];
  expect(repairLoadedGatewayServiceForStart).toHaveBeenCalledWith(
    expect.objectContaining({
      service,
      json: true,
      state: expectedState,
      issues: expectedIssues,
    }),
  );
});
it("kills stale gateway pids and retries restart", async () => {
const unhealthy: RestartHealthSnapshot = {
healthy: false,

View File

@@ -29,6 +29,7 @@ import {
waitForGatewayHealthyRestart,
} from "./restart-health.js";
import { parsePortFromArgs, renderGatewayServiceStartHints } from "./shared.js";
import { repairLoadedGatewayServiceForStart } from "./start-repair.js";
import type { DaemonLifecycleOptions } from "./types.js";
const POST_RESTART_HEALTH_ATTEMPTS = DEFAULT_RESTART_HEALTH_ATTEMPTS;
@@ -150,14 +151,23 @@ export async function runDaemonUninstall(opts: DaemonLifecycleOptions = {}) {
}
/**
 * Starts the gateway service through the shared `runServiceStart` flow.
 *
 * The service is resolved once and that same object is used both for the
 * start call and inside the repair callback.
 *
 * @param opts Lifecycle options forwarded unchanged to `runServiceStart`.
 * @returns Whatever `runServiceStart` resolves to.
 */
export async function runDaemonStart(opts: DaemonLifecycleOptions = {}) {
  const service = resolveGatewayService();
  return await runServiceStart({
    serviceNoun: "Gateway",
    // Single `service` entry: a second `service: resolveGatewayService()` key
    // would be a duplicate property and a redundant resolve call.
    service,
    renderStartHints: renderGatewayServiceStartHints,
    // A not-loaded recovery callback is only supplied on macOS.
    onNotLoaded:
      process.platform === "darwin"
        ? async () => await recoverInstalledLaunchAgent({ result: "started" })
        : undefined,
    // Forward the repair context together with the resolved service.
    repairLoadedService: async ({ json, stdout, state, issues }) =>
      await repairLoadedGatewayServiceForStart({
        service,
        json,
        stdout,
        state,
        issues,
      }),
    opts,
  });
}

View File

@@ -0,0 +1,94 @@
import { buildGatewayInstallPlan } from "../../commands/daemon-install-helpers.js";
import { DEFAULT_GATEWAY_DAEMON_RUNTIME } from "../../commands/daemon-runtime.js";
import { resolveGatewayInstallToken } from "../../commands/gateway-install-token.js";
import { readConfigFileSnapshotForWrite } from "../../config/io.js";
import { resolveGatewayPort } from "../../config/paths.js";
import { OPENCLAW_WRAPPER_ENV_KEY, resolveOpenClawWrapperPath } from "../../daemon/program-args.js";
import type { GatewayServiceEnv } from "../../daemon/service-types.js";
import type {
GatewayService,
GatewayServiceStartRepairIssue,
GatewayServiceState,
} from "../../daemon/service.js";
import { formatGatewayServiceStartRepairIssues } from "../../daemon/service.js";
import { defaultRuntime } from "../../runtime.js";
import { mergeInstallInvocationEnv } from "./install.js";
/**
 * Rebuilds the gateway install plan from the current config and environment
 * and reinstalls the service definition through `params.service.install`.
 *
 * Steps, in order: read the config snapshot, merge the existing service
 * environment into the invocation environment, resolve the wrapper path,
 * port and gateway token (requesting auto-generation and persistence when
 * the token is missing), build the install plan, install it, then query
 * whether the service is loaded.
 *
 * NOTE(review): the returned result/message say "started"; this assumes
 * `service.install()` also (re)starts the service — confirm.
 *
 * @param params.service Service backend used to install and query load state.
 * @param params.state State from the start attempt; its command environment,
 *   if any, seeds the new install environment.
 * @param params.issues Issues that triggered the repair; their formatted form
 *   becomes the first warning.
 * @param params.json When true, nothing is logged through `defaultRuntime`.
 * @param params.stdout Stream handed to `service.install`.
 * @returns The repair outcome; `warnings` is omitted when there are none.
 * @throws Error when token resolution reports an `unavailableReason`.
 */
export async function repairLoadedGatewayServiceForStart(params: {
  service: GatewayService;
  state: GatewayServiceState;
  issues: GatewayServiceStartRepairIssue[];
  json: boolean;
  stdout: NodeJS.WritableStream;
}): Promise<{ result: "started"; message: string; warnings?: string[]; loaded: boolean }> {
  // Logs only in human-readable mode; JSON mode stays silent.
  const announce = (line: string): void => {
    if (!params.json) {
      defaultRuntime.log(line);
    }
  };

  const pendingWrite = await readConfigFileSnapshotForWrite();
  const configSnapshot = pendingWrite.snapshot;
  const configWriteOptions = pendingWrite.writeOptions;
  const cfg = configSnapshot.valid ? configSnapshot.sourceConfig : configSnapshot.config;

  const existingEnvironment = params.state.command?.environment;
  const installEnv = mergeInstallInvocationEnv({
    env: process.env,
    existingServiceEnv: existingEnvironment,
  });
  const wrapperPath = await resolveOpenClawWrapperPath(installEnv[OPENCLAW_WRAPPER_ENV_KEY]);
  const port = resolveGatewayPort(cfg);

  const tokenResolution = await resolveGatewayInstallToken({
    config: cfg,
    configSnapshot,
    configWriteOptions,
    env: installEnv,
    autoGenerateWhenMissing: true,
    persistGeneratedToken: true,
  });
  const { unavailableReason } = tokenResolution;
  if (unavailableReason) {
    throw new Error(unavailableReason);
  }

  // Formatted repair issues first, then token warnings; blank entries dropped.
  const warnings: string[] = [];
  const candidateWarnings = [
    formatGatewayServiceStartRepairIssues(params.issues),
    ...tokenResolution.warnings,
  ];
  for (const candidate of candidateWarnings) {
    if (candidate.trim().length > 0) {
      warnings.push(candidate);
    }
  }

  if (!params.json) {
    defaultRuntime.log("Gateway service definition needs repair:");
    warnings.forEach((entry) => {
      defaultRuntime.log(`- ${entry}`);
    });
  }

  const plan = await buildGatewayInstallPlan({
    env: installEnv,
    port,
    runtime: DEFAULT_GATEWAY_DAEMON_RUNTIME,
    wrapperPath,
    existingEnvironment,
    config: cfg,
    // Plan-time warnings are collected and also logged as they arrive.
    warn: (message) => {
      warnings.push(message);
      announce(`- ${message}`);
    },
  });

  await params.service.install({
    env: installEnv as GatewayServiceEnv,
    stdout: params.stdout,
    programArguments: plan.programArguments,
    workingDirectory: plan.workingDirectory,
    environment: plan.environment,
  });

  // Best effort: if the load state cannot be read, report the service as loaded.
  let loaded: boolean;
  try {
    loaded = await params.service.isLoaded({ env: installEnv });
  } catch {
    loaded = true;
  }

  return {
    result: "started",
    message: "Gateway service definition repaired and started.",
    warnings: warnings.length > 0 ? warnings : undefined,
    loaded,
  };
}