fix: repair stale gateway service on start

This commit is contained in:
Peter Steinberger
2026-05-02 20:56:12 +01:00
parent 9eb79bcf99
commit 3b1a020eba
10 changed files with 383 additions and 3 deletions

View File

@@ -12,6 +12,7 @@ Docs: https://docs.openclaw.ai
### Fixes
- Agents/sessions: preserve terminal lifecycle state when final run metadata persists from a stale in-memory snapshot, preventing `main` sessions from staying stuck as running after completed or timed-out turns.
- Gateway/CLI: make `openclaw gateway start` repair stale managed service definitions that point at old OpenClaw versions, missing binaries, or temporary installer paths before starting.
- Status: show the `openai-codex` OAuth profile for `openai/gpt-*` sessions running through the native Codex runtime instead of reporting auth as unknown. (#76197) Thanks @mbelinky.
- Plugins/externalization: keep diagnostics ClawHub packages and persisted bundled-plugin relocation on npm-first install metadata for launch, and omit Discord from the core package now that its external package is published. Thanks @vincentkoc.
- Plugins/Codex: allow the official npm Codex plugin to install without the unsafe-install override, keep `/codex` command ownership, and cover the real npm Docker live path through managed `.openclaw/npm` dependencies plus uninstall failure proof.

View File

@@ -31,7 +31,7 @@ import {
} from "./shared.js";
import type { DaemonInstallOptions } from "./types.js";
function mergeInstallInvocationEnv(params: {
export function mergeInstallInvocationEnv(params: {
env: NodeJS.ProcessEnv;
existingServiceEnv?: Record<string, string>;
}): NodeJS.ProcessEnv {

View File

@@ -353,6 +353,55 @@ describe("runServiceRestart token drift", () => {
expect(payload.message).toBe("restart scheduled, gateway will restart momentarily");
});
it("repairs stale loaded services during start before reporting success", async () => {
  // The service definition is stamped with OPENCLAW_SERVICE_VERSION 2026.4.24.
  service.readCommand.mockResolvedValue({
    programArguments: ["openclaw", "gateway"],
    environment: { OPENCLAW_SERVICE_VERSION: "2026.4.24" },
  });
  // Repair callback stub that reports a successful repair-and-start.
  const repairSpy = vi.fn(async () => ({
    result: "started" as const,
    message: "Gateway service definition repaired and started.",
    warnings: ["service was installed by OpenClaw 2026.4.24, current CLI is 2026.5.2"],
    loaded: true,
  }));

  await runServiceStart({
    serviceNoun: "Gateway",
    service,
    renderStartHints: () => [],
    opts: { json: true },
    repairLoadedService: repairSpy,
  });

  // The repair callback must run exactly once and the plain restart must be skipped.
  expect(repairSpy).toHaveBeenCalledTimes(1);
  expect(service.restart).not.toHaveBeenCalled();

  // The emitted JSON payload mirrors what the repair callback returned.
  type StartPayload = {
    result?: string;
    message?: string;
    warnings?: string[];
    service?: { loaded?: boolean };
  };
  const logged = readJsonLog<StartPayload>();
  expect(logged.result).toBe("started");
  expect(logged.message).toBe("Gateway service definition repaired and started.");
  expect(logged.warnings?.[0]).toContain("service was installed by OpenClaw");
  expect(logged.service?.loaded).toBe(true);
});
it("fails start with an install hint when a stale loaded service has no repair callback", async () => {
  // The service definition is stamped with OPENCLAW_SERVICE_VERSION 2026.4.24.
  service.readCommand.mockResolvedValue({
    programArguments: ["openclaw", "gateway"],
    environment: { OPENCLAW_SERVICE_VERSION: "2026.4.24" },
  });

  // No `repairLoadedService` is supplied, so the start must exit with code 1.
  const startAttempt = runServiceStart(createServiceRunArgs());
  await expect(startAttempt).rejects.toThrow("__exit__:1");

  // The restart path must never be reached when repair is required.
  expect(service.restart).not.toHaveBeenCalled();

  type FailurePayload = { ok?: boolean; error?: string; hints?: string[] };
  const failure = readJsonLog<FailurePayload>();
  expect(failure.ok).toBe(false);
  expect(failure.error).toContain("service needs repair");
  expect(failure.hints).toEqual(["openclaw gateway install --force"]);
});
it("fails start when restarting a stopped installed service errors", async () => {
service.isLoaded.mockResolvedValue(false);
service.restart.mockRejectedValue(new Error("launchctl kickstart failed: permission denied"));

View File

@@ -5,6 +5,7 @@ import { formatConfigIssueLines } from "../../config/issue-format.js";
import { resolveIsNixMode } from "../../config/paths.js";
import { checkTokenDrift } from "../../daemon/service-audit.js";
import type { GatewayServiceRestartResult } from "../../daemon/service-types.js";
import type { GatewayServiceStartRepairIssue, GatewayServiceState } from "../../daemon/service.js";
import { describeGatewayServiceRestart, startGatewayService } from "../../daemon/service.js";
import type { GatewayService } from "../../daemon/service.js";
import { renderSystemdUnavailableHints } from "../../daemon/systemd-hints.js";
@@ -16,6 +17,7 @@ import {
} from "../../infra/restart.js";
import { isWSL } from "../../infra/wsl.js";
import { defaultRuntime } from "../../runtime.js";
import { formatCliCommand } from "../command-format.js";
import { resolveGatewayTokenForDriftCheck } from "./gateway-token-drift.js";
import {
buildDaemonServiceSnapshot,
@@ -48,6 +50,11 @@ type ServiceRecoveryContext = {
fail: (message: string, hints?: string[]) => void;
};
/**
 * Context passed to the optional `repairLoadedService` callback of `runServiceStart`:
 * the shared recovery context plus the details of the `repair-required` start outcome.
 */
type ServiceStartRepairContext = ServiceRecoveryContext & {
// Service state reported together with the `repair-required` start outcome.
state: GatewayServiceState;
// Issues that caused the start to be routed to repair instead of a restart.
issues: GatewayServiceStartRepairIssue[];
};
async function maybeAugmentSystemdHints(hints: string[]): Promise<string[]> {
if (process.platform !== "linux") {
return hints;
@@ -221,6 +228,7 @@ export async function runServiceStart(params: {
renderStartHints: () => string[];
opts?: DaemonLifecycleOptions;
onNotLoaded?: (ctx: ServiceRecoveryContext) => Promise<ServiceRecoveryResult | null>;
repairLoadedService?: (ctx: ServiceStartRepairContext) => Promise<ServiceRecoveryResult | null>;
}) {
const json = Boolean(params.opts?.json);
const { stdout, emit, fail } = createDaemonActionContext({ action: "start", json });
@@ -298,6 +306,41 @@ export async function runServiceStart(params: {
});
return;
}
if (startResult.outcome === "repair-required") {
try {
const handled = await params.repairLoadedService?.({
json,
stdout,
fail,
state: startResult.state,
issues: startResult.issues,
});
if (handled) {
emit({
ok: true,
result: handled.result,
message: handled.message,
warnings: handled.warnings,
service: buildDaemonServiceSnapshot(params.service, handled.loaded ?? true),
});
if (!json && handled.message) {
defaultRuntime.log(handled.message);
}
return;
}
} catch (err) {
const hints = params.renderStartHints();
fail(`${params.serviceNoun} repair failed: ${String(err)}`, hints);
return;
}
fail(
`${params.serviceNoun} service needs repair before it can start: ${startResult.issues
.map((issue) => issue.message)
.join("; ")}`,
[formatCliCommand("openclaw gateway install --force")],
);
return;
}
emit({
ok: true,
result: "started",

View File

@@ -52,6 +52,7 @@ const probeGateway = vi.fn<
// Stub for the restart-enabled check; reports `true` unless a test overrides it.
const isRestartEnabled = vi.fn<(config?: { commands?: unknown }) => boolean>(() => true);
// The mocks below are created through `vi.hoisted`, presumably so the `vi.mock`
// factories in this file can reference them — confirm against vitest hoisting rules.
// Config loader stub; returns an empty config object by default.
const loadConfig = vi.hoisted(() => vi.fn(() => ({})));
// Stand-in for the launchd recovery helper mocked from "./launchd-recovery.js".
const recoverInstalledLaunchAgent = vi.hoisted(() => vi.fn());
// Stand-in for the start-time repair helper mocked from "./start-repair.js".
const repairLoadedGatewayServiceForStart = vi.hoisted(() => vi.fn());
vi.mock("../../config/config.js", () => ({
getRuntimeConfig: () => loadConfig(),
@@ -89,6 +90,10 @@ vi.mock("./launchd-recovery.js", () => ({
recoverInstalledLaunchAgent(args),
}));
// Replace the start-repair module with a thin forwarder to the hoisted mock so
// tests can inspect the arguments `runDaemonStart` passes to the repair helper.
vi.mock("./start-repair.js", () => ({
repairLoadedGatewayServiceForStart: (args: unknown) => repairLoadedGatewayServiceForStart(args),
}));
vi.mock("./restart-health.js", () => ({
DEFAULT_RESTART_HEALTH_ATTEMPTS: 120,
DEFAULT_RESTART_HEALTH_DELAY_MS: 500,
@@ -160,6 +165,7 @@ describe("runDaemonRestart health checks", () => {
isRestartEnabled.mockReset();
loadConfig.mockReset();
recoverInstalledLaunchAgent.mockReset();
repairLoadedGatewayServiceForStart.mockReset();
service.readCommand.mockResolvedValue({
programArguments: ["openclaw", "gateway", "--port", "18789"],
@@ -224,6 +230,46 @@ describe("runDaemonRestart health checks", () => {
expect(recoverInstalledLaunchAgent).toHaveBeenCalledWith({ result: "started" });
});
it("repairs stale loaded service definitions from gateway start", async () => {
  repairLoadedGatewayServiceForStart.mockResolvedValue({
    result: "started",
    message: "Gateway service definition repaired and started.",
    loaded: true,
  });

  // Shape of the context the lifecycle core hands to the repair callback.
  type RepairCallbackArgs = {
    json: boolean;
    stdout: NodeJS.WritableStream;
    state: unknown;
    issues: unknown[];
  };
  type StartParams = {
    repairLoadedService?: (args: RepairCallbackArgs) => Promise<unknown>;
  };

  // Stand-in for `runServiceStart` that immediately invokes the repair callback.
  runServiceStart.mockImplementation(async (params: StartParams) => {
    const repairContext: RepairCallbackArgs = {
      json: true,
      stdout: process.stdout,
      state: { command: { environment: { OPENCLAW_SERVICE_VERSION: "2026.4.24" } } },
      issues: [{ code: "version-mismatch", message: "old service" }],
    };
    await params.repairLoadedService?.(repairContext);
  });

  await runDaemonStart({ json: true });

  // `runDaemonStart` must forward the context and add the resolved service.
  const expectedState = expect.objectContaining({
    command: expect.objectContaining({
      environment: { OPENCLAW_SERVICE_VERSION: "2026.4.24" },
    }),
  });
  const expectedIssues = [expect.objectContaining({ code: "version-mismatch" })];
  expect(repairLoadedGatewayServiceForStart).toHaveBeenCalledWith(
    expect.objectContaining({
      service,
      json: true,
      state: expectedState,
      issues: expectedIssues,
    }),
  );
});
it("kills stale gateway pids and retries restart", async () => {
const unhealthy: RestartHealthSnapshot = {
healthy: false,

View File

@@ -29,6 +29,7 @@ import {
waitForGatewayHealthyRestart,
} from "./restart-health.js";
import { parsePortFromArgs, renderGatewayServiceStartHints } from "./shared.js";
import { repairLoadedGatewayServiceForStart } from "./start-repair.js";
import type { DaemonLifecycleOptions } from "./types.js";
const POST_RESTART_HEALTH_ATTEMPTS = DEFAULT_RESTART_HEALTH_ATTEMPTS;
@@ -150,14 +151,23 @@ export async function runDaemonUninstall(opts: DaemonLifecycleOptions = {}) {
}
/**
 * Starts the managed Gateway service through the shared `runServiceStart` flow.
 *
 * - On darwin, a service that is not loaded is handed to `recoverInstalledLaunchAgent`.
 * - When the start flow asks for a repair, the request is delegated to
 *   `repairLoadedGatewayServiceForStart` together with the resolved service.
 *
 * @param opts Lifecycle options (for example `json`) forwarded to `runServiceStart`.
 * @returns Whatever `runServiceStart` resolves to.
 */
export async function runDaemonStart(opts: DaemonLifecycleOptions = {}) {
  // Resolve the service once so the start flow and the repair callback share the
  // same instance; the object literal must carry a single `service` property.
  const service = resolveGatewayService();
  return await runServiceStart({
    serviceNoun: "Gateway",
    service,
    renderStartHints: renderGatewayServiceStartHints,
    onNotLoaded:
      process.platform === "darwin"
        ? async () => await recoverInstalledLaunchAgent({ result: "started" })
        : undefined,
    repairLoadedService: async ({ json, stdout, state, issues }) =>
      await repairLoadedGatewayServiceForStart({
        service,
        json,
        stdout,
        state,
        issues,
      }),
    opts,
  });
}

View File

@@ -0,0 +1,94 @@
import { buildGatewayInstallPlan } from "../../commands/daemon-install-helpers.js";
import { DEFAULT_GATEWAY_DAEMON_RUNTIME } from "../../commands/daemon-runtime.js";
import { resolveGatewayInstallToken } from "../../commands/gateway-install-token.js";
import { readConfigFileSnapshotForWrite } from "../../config/io.js";
import { resolveGatewayPort } from "../../config/paths.js";
import { OPENCLAW_WRAPPER_ENV_KEY, resolveOpenClawWrapperPath } from "../../daemon/program-args.js";
import type { GatewayServiceEnv } from "../../daemon/service-types.js";
import type {
GatewayService,
GatewayServiceStartRepairIssue,
GatewayServiceState,
} from "../../daemon/service.js";
import { formatGatewayServiceStartRepairIssues } from "../../daemon/service.js";
import { defaultRuntime } from "../../runtime.js";
import { mergeInstallInvocationEnv } from "./install.js";
/**
 * Rebuilds and reinstalls the gateway service definition during `start`.
 *
 * Steps, in order: read the config snapshot, merge the invoking environment with
 * the environment recorded in the existing service, resolve the wrapper path,
 * port, and gateway token, build a fresh install plan, and install it.
 *
 * @param params.service Service backend used for `install` and `isLoaded`.
 * @param params.state Current service state; its recorded environment seeds the new plan.
 * @param params.issues Issues that made the repair necessary; reported as warnings.
 * @param params.json When true, no human-readable progress lines are logged.
 * @param params.stdout Stream handed to the service installer.
 * @returns A `"started"` result with the collected warnings (omitted when empty)
 *   and the loaded flag read back after installation.
 * @throws Error when token resolution reports an `unavailableReason`.
 */
export async function repairLoadedGatewayServiceForStart(params: {
  service: GatewayService;
  state: GatewayServiceState;
  issues: GatewayServiceStartRepairIssue[];
  json: boolean;
  stdout: NodeJS.WritableStream;
}): Promise<{ result: "started"; message: string; warnings?: string[]; loaded: boolean }> {
  const { service, state, issues, json, stdout } = params;

  // Human-readable progress is only written outside JSON mode.
  const logLine = (line: string): void => {
    if (!json) {
      defaultRuntime.log(line);
    }
  };

  const configForWrite = await readConfigFileSnapshotForWrite();
  const { snapshot: configSnapshot, writeOptions: configWriteOptions } = configForWrite;
  const cfg = configSnapshot.valid ? configSnapshot.sourceConfig : configSnapshot.config;

  // Carry over the environment recorded in the existing service definition.
  const existingEnvironment = state.command?.environment;
  const installEnv = mergeInstallInvocationEnv({
    env: process.env,
    existingServiceEnv: existingEnvironment,
  });
  const wrapperPath = await resolveOpenClawWrapperPath(installEnv[OPENCLAW_WRAPPER_ENV_KEY]);
  const port = resolveGatewayPort(cfg);

  const tokenResolution = await resolveGatewayInstallToken({
    config: cfg,
    configSnapshot,
    configWriteOptions,
    env: installEnv,
    autoGenerateWhenMissing: true,
    persistGeneratedToken: true,
  });
  const { unavailableReason } = tokenResolution;
  if (unavailableReason) {
    throw new Error(unavailableReason);
  }

  // Repair reasons come first, followed by token warnings; blank entries are dropped.
  const warnings = [
    formatGatewayServiceStartRepairIssues(issues),
    ...tokenResolution.warnings,
  ].filter((entry) => entry.trim().length > 0);
  logLine("Gateway service definition needs repair:");
  warnings.forEach((entry) => {
    logLine(`- ${entry}`);
  });

  const installPlan = await buildGatewayInstallPlan({
    env: installEnv,
    port,
    runtime: DEFAULT_GATEWAY_DAEMON_RUNTIME,
    wrapperPath,
    existingEnvironment,
    config: cfg,
    // Plan-time warnings are both recorded for the result and echoed immediately.
    warn: (message) => {
      warnings.push(message);
      logLine(`- ${message}`);
    },
  });

  await service.install({
    env: installEnv as GatewayServiceEnv,
    stdout,
    programArguments: installPlan.programArguments,
    workingDirectory: installPlan.workingDirectory,
    environment: installPlan.environment,
  });

  // Best effort: if the loaded state cannot be read back, report the service as loaded.
  let loaded: boolean;
  try {
    loaded = await service.isLoaded({ env: installEnv });
  } catch {
    loaded = true;
  }

  return {
    result: "started",
    message: "Gateway service definition repaired and started.",
    warnings: warnings.length > 0 ? warnings : undefined,
    loaded,
  };
}

View File

@@ -48,10 +48,20 @@ export type GatewayServiceState = {
runtime?: GatewayServiceRuntime;
};
/**
 * One reason a loaded gateway service definition must be repaired before it is started.
 */
export type GatewayServiceStartRepairIssue = {
// Machine-readable reason: the program path is missing, the program path lies
// under a temporary directory, or the recorded service version differs from the CLI.
code: "missing-program" | "temporary-program" | "version-mismatch";
// Human-readable description; several issues are joined with "; " for display.
message: string;
};
/**
 * Result of a gateway service start attempt, discriminated by `outcome`.
 *
 * Every variant carries the service `state`. The `repair-required` variant is
 * produced when the loaded service definition has issues that must be repaired
 * before a restart is attempted, and additionally lists those `issues`.
 */
export type GatewayServiceStartResult =
  | { outcome: "started"; state: GatewayServiceState }
  | { outcome: "scheduled"; state: GatewayServiceState }
  | { outcome: "missing-install"; state: GatewayServiceState }
  | {
      outcome: "repair-required";
      state: GatewayServiceState;
      issues: GatewayServiceStartRepairIssue[];
    };
export type GatewayServiceRenderArgs = {
description?: string;

View File

@@ -7,6 +7,7 @@ import { captureEnv } from "../test-utils/env.js";
import type { GatewayService } from "./service.js";
import {
describeGatewayServiceRestart,
formatGatewayServiceStartRepairIssues,
readGatewayServiceState,
resolveGatewayService,
startGatewayService,
@@ -168,6 +169,55 @@ describe("startGatewayService", () => {
expect(result.state.running).toBe(true);
});
it("requests repair before start when the loaded service version is stale", async () => {
  // Loaded but stopped service whose definition is stamped with version 2026.4.24.
  const service = createService({
    isLoaded: vi.fn(async () => true),
    readRuntime: vi.fn(async () => ({ status: "stopped" })),
    readCommand: vi.fn(async () => ({
      programArguments: ["openclaw", "gateway", "run"],
      environment: { OPENCLAW_SERVICE_VERSION: "2026.4.24" },
    })),
  });

  const startResult = await startGatewayService(service, {
    env: {},
    stdout: process.stdout,
  });

  // A restart must not be attempted while the definition is stale.
  expect(service.restart).not.toHaveBeenCalled();
  expect(startResult.outcome).toBe("repair-required");
  // Narrow the union before reading `issues`; any other outcome already failed above.
  if (startResult.outcome === "repair-required") {
    const summary = formatGatewayServiceStartRepairIssues(startResult.issues);
    expect(summary).toContain("service was installed by OpenClaw 2026.4.24");
  }
});
it("requests repair before start when the loaded service points at temporary install paths", async () => {
  // Both leading program arguments live under temporary directories.
  const service = createService({
    isLoaded: vi.fn(async () => true),
    readCommand: vi.fn(async () => ({
      programArguments: [
        "/private/tmp/openclaw-ai-install-cli-pr118/tools/node/bin/node",
        "/tmp/openclaw-ai-install-cli-pr118/lib/node_modules/openclaw/dist/index.js",
        "gateway",
      ],
      environment: {},
    })),
  });

  const startResult = await startGatewayService(service, {
    env: {},
    stdout: process.stdout,
  });

  // A restart must not be attempted while the definition points at temp paths.
  expect(service.restart).not.toHaveBeenCalled();
  expect(startResult.outcome).toBe("repair-required");
  // Narrow the union before reading `issues`; any other outcome already failed above.
  if (startResult.outcome === "repair-required") {
    const issueCodes = startResult.issues.map(({ code }) => code);
    expect(issueCodes).toContain("temporary-program");
  }
});
it("falls back to missing-install when restart fails and install artifacts are gone", async () => {
const readCommand = vi
.fn<GatewayService["readCommand"]>()

View File

@@ -1,4 +1,8 @@
import fs from "node:fs";
import os from "node:os";
import path from "node:path";
import { normalizeLowercaseStringOrEmpty } from "../shared/string-coerce.js";
import { VERSION } from "../version.js";
import { assertFutureConfigActionAllowed } from "./future-config-guard.js";
import {
installLaunchAgent,
@@ -29,6 +33,7 @@ import type {
GatewayServiceInstallArgs,
GatewayServiceManageArgs,
GatewayServiceRestartResult,
GatewayServiceStartRepairIssue,
GatewayServiceStartResult,
GatewayServiceStageArgs,
GatewayServiceState,
@@ -51,6 +56,7 @@ export type {
GatewayServiceInstallArgs,
GatewayServiceManageArgs,
GatewayServiceRestartResult,
GatewayServiceStartRepairIssue,
GatewayServiceStartResult,
GatewayServiceStageArgs,
GatewayServiceState,
@@ -91,6 +97,68 @@ function mergeGatewayServiceEnv(
};
}
/**
 * Directory roots under which a service program path counts as temporary.
 *
 * Each root is listed in its lexically resolved form and, when the directory
 * exists, in its canonical (symlink-resolved) form as well. This covers service
 * definitions that record canonical paths for a temp dir reached through a
 * symlink, which the hard-coded `/tmp` + `/private/tmp` pair only handles for
 * one specific location. Duplicates are removed.
 */
const TEMP_PROGRAM_ROOTS = [
  ...new Set(
    [os.tmpdir(), "/tmp", "/private/tmp", "/var/tmp"].flatMap((entry) => {
      const resolved = path.resolve(entry);
      try {
        return [resolved, fs.realpathSync(resolved)];
      } catch {
        // The root does not exist on this platform; keep only the lexical form.
        return [resolved];
      }
    }),
  ),
];
/**
 * Reports whether `candidate` equals `parent` or starts with `parent` followed
 * by the platform path separator. The comparison is purely textual.
 */
function pathIsSameOrChild(candidate: string, parent: string): boolean {
  if (candidate === parent) {
    return true;
  }
  // Appending the separator keeps siblings such as "/tmpx" from matching "/tmp".
  const childPrefix = parent + path.sep;
  return candidate.startsWith(childPrefix);
}
/**
 * Reports whether `value` is an absolute path that equals, or lies under, one of
 * the roots in `TEMP_PROGRAM_ROOTS`. Empty, missing, and relative values yield false.
 */
function isTemporaryProgramPath(value: string | undefined): boolean {
  if (!value) {
    return false;
  }
  if (!path.isAbsolute(value)) {
    return false;
  }
  const normalized = path.resolve(value);
  for (const root of TEMP_PROGRAM_ROOTS) {
    if (pathIsSameOrChild(normalized, root)) {
      return true;
    }
  }
  return false;
}
/**
 * Reports whether `value` is an absolute path that does not exist on disk.
 * Empty, missing, and relative values yield false because they are not checked.
 */
function isMissingProgramPath(value: string | undefined): boolean {
  if (!value) {
    return false;
  }
  if (!path.isAbsolute(value)) {
    return false;
  }
  const present = fs.existsSync(value);
  return !present;
}
/**
 * Lists the reasons a loaded service definition must be repaired before starting.
 *
 * Returns an empty list when the service is not loaded or has no command.
 * Otherwise reports a version mismatch when the recorded
 * `OPENCLAW_SERVICE_VERSION` differs from the CLI `VERSION`, followed by one
 * issue per leading program argument that is temporary or missing.
 */
function collectGatewayServiceStartRepairIssues(
  state: GatewayServiceState,
): GatewayServiceStartRepairIssue[] {
  const { command, loaded } = state;
  if (!loaded || !command) {
    return [];
  }

  const found: GatewayServiceStartRepairIssue[] = [];

  const installedVersion = command.environment?.OPENCLAW_SERVICE_VERSION?.trim();
  if (installedVersion && installedVersion !== VERSION) {
    found.push({
      code: "version-mismatch",
      message: `service was installed by OpenClaw ${installedVersion}, current CLI is ${VERSION}`,
    });
  }

  // Only the first two arguments are inspected (presumably the executable and
  // its entry script — confirm against the install plan). A temporary path is
  // reported as such and is not additionally checked for existence.
  const leadingArguments = command.programArguments.slice(0, 2);
  leadingArguments.forEach((candidate) => {
    if (isTemporaryProgramPath(candidate)) {
      found.push({
        code: "temporary-program",
        message: `service command points at a temporary path: ${candidate}`,
      });
    } else if (isMissingProgramPath(candidate)) {
      found.push({
        code: "missing-program",
        message: `service command points at a missing path: ${candidate}`,
      });
    }
  });

  return found;
}
/**
 * Joins the messages of the given repair issues into one line separated by "; ".
 *
 * @param issues Issues to describe, in display order.
 * @returns The joined messages, or an empty string when there are no issues.
 */
export function formatGatewayServiceStartRepairIssues(
  issues: GatewayServiceStartRepairIssue[],
): string {
  const messages: string[] = [];
  for (const { message } of issues) {
    messages.push(message);
  }
  return messages.join("; ");
}
export async function readGatewayServiceState(
service: GatewayService,
args: GatewayServiceEnvArgs = {},
@@ -124,6 +192,15 @@ export async function startGatewayService(
};
}
const repairIssues = collectGatewayServiceStartRepairIssues(state);
if (repairIssues.length > 0) {
return {
outcome: "repair-required",
state,
issues: repairIssues,
};
}
try {
const restartResult = await service.restart({ ...args, env: state.env });
const nextState = await readGatewayServiceState(service, { env: state.env });