fix(gateway): recover invalid config before startup

This commit is contained in:
Peter Steinberger
2026-04-22 21:53:28 +01:00
parent f70a46b703
commit 64fb6f71b4
6 changed files with 223 additions and 11 deletions

View File

@@ -25,6 +25,12 @@ const configState = vi.hoisted(() => ({
cfg: {} as Record<string, unknown>,
snapshot: { exists: false } as Record<string, unknown>,
}));
// Test double for the config module's `recoverConfigFromLastKnownGood`.
// The default implementation resolves to `false`; individual tests override it
// (e.g. with `mockImplementationOnce`) to simulate a successful restore.
const recoverConfigFromLastKnownGood = vi.fn<(params?: unknown) => Promise<boolean>>(
async (_params?: unknown) => false,
);
// Test double for `writeRestartSentinel`. The default implementation resolves
// to a fixed sentinel path; tests can make it reject to exercise the
// failure-handling path.
const writeRestartSentinel = vi.fn<(payload?: unknown) => Promise<string>>(
async (_payload?: unknown) => "/tmp/restart-sentinel.json",
);
const controlUiState = vi.hoisted(() => ({
root: "/tmp/openclaw-control-ui" as string | null,
}));
@@ -33,10 +39,11 @@ const { runtimeErrors, defaultRuntime, resetRuntimeCapture } = createCliRuntimeC
vi.mock("../../config/config.js", () => ({
getConfigPath: () => "/tmp/openclaw-test-missing-config.json",
loadConfig: () => configState.cfg,
readBestEffortConfig: async () => configState.cfg,
readConfigFileSnapshot: async () => configState.snapshot,
recoverConfigFromLastKnownGood: (params: unknown) => recoverConfigFromLastKnownGood(params),
resolveStateDir: () => "/tmp",
resolveGatewayPort: () => 18789,
resolveGatewayPort: (cfg?: { gateway?: { port?: number } }) => cfg?.gateway?.port ?? 18789,
}));
vi.mock("../../gateway/auth.js", () => ({
@@ -90,6 +97,10 @@ vi.mock("../../infra/ports.js", () => ({
inspectPortUsage: async () => ({ status: "free" }),
}));
// Replace the restart-sentinel module with a thin wrapper that forwards to the
// `writeRestartSentinel` mock, so tests can assert on the payload and inject failures.
vi.mock("../../infra/restart-sentinel.js", () => ({
writeRestartSentinel: (payload: unknown) => writeRestartSentinel(payload),
}));
vi.mock("../../logging/console.js", () => ({
setConsoleSubsystemFilter: (filters: string[]) => setConsoleSubsystemFilter(filters),
setConsoleTimestampPrefix: () => undefined,
@@ -100,7 +111,9 @@ vi.mock("../../logging/subsystem.js", () => ({
info: (message: string) => {
gatewayLogMessages.push(message);
},
warn: () => undefined,
warn: (message: string) => {
gatewayLogMessages.push(message);
},
error: () => undefined,
}),
}));
@@ -144,6 +157,10 @@ describe("gateway run option collisions", () => {
configState.snapshot = { exists: false };
controlUiState.root = "/tmp/openclaw-control-ui";
gatewayLogMessages.length = 0;
recoverConfigFromLastKnownGood.mockReset();
recoverConfigFromLastKnownGood.mockResolvedValue(false);
writeRestartSentinel.mockReset();
writeRestartSentinel.mockResolvedValue("/tmp/restart-sentinel.json");
startGatewayServer.mockClear();
setGatewayWsLogStyle.mockClear();
setVerbose.mockClear();
@@ -258,6 +275,119 @@ describe("gateway run option collisions", () => {
expect(startGatewayServer).not.toHaveBeenCalled();
});
it("restores last-known-good config before startup when the effective config is invalid", async () => {
  // Fixture values shared between the arrange and assert phases.
  const configPath = "/tmp/openclaw-test-missing-config.json";
  const recoveryReason = "gateway-run-invalid-config";
  // Factory (not a shared object) so `config` and `parsed` stay distinct instances.
  const makeRestoredConfig = () => ({
    gateway: {
      mode: "local",
      port: 19170,
      auth: { mode: "none" },
    },
  });

  // Arrange: the config file exists but could not be parsed.
  configState.cfg = {};
  configState.snapshot = {
    exists: true,
    valid: false,
    path: configPath,
    config: {},
    parsed: null,
    issues: [{ path: "<root>", message: "JSON5 parse failed" }],
    legacyIssues: [],
  };

  // Recovery succeeds once and swaps in a valid snapshot for the subsequent re-read.
  recoverConfigFromLastKnownGood.mockImplementationOnce(async () => {
    configState.snapshot = {
      exists: true,
      valid: true,
      path: configPath,
      config: makeRestoredConfig(),
      parsed: makeRestoredConfig(),
      issues: [],
      legacyIssues: [],
    };
    return true;
  });

  // Act
  await runGatewayCli(["gateway", "run", "--allow-unconfigured"]);

  // Assert: recovery was requested with the invalid snapshot.
  expect(recoverConfigFromLastKnownGood).toHaveBeenCalledWith({
    snapshot: expect.objectContaining({
      exists: true,
      valid: false,
    }),
    reason: recoveryReason,
  });

  // Assert: a recovery notice was persisted through the restart sentinel.
  expect(writeRestartSentinel).toHaveBeenCalledWith({
    kind: "config-auto-recovery",
    status: "ok",
    ts: expect.any(Number),
    message:
      "Gateway recovered automatically after a failed config change and restored the last known good configuration.",
    stats: {
      mode: "config-auto-recovery",
      reason: recoveryReason,
      after: { restoredFrom: "last-known-good" },
    },
  });

  // Assert: the restore was logged.
  expect(gatewayLogMessages).toContain(
    "gateway: restored invalid effective config from last-known-good backup: /tmp/openclaw-test-missing-config.json",
  );

  // Assert: the gateway started on the port taken from the restored config.
  expect(startGatewayServer).toHaveBeenCalledWith(
    19170,
    expect.objectContaining({
      bind: "loopback",
      auth: undefined,
    }),
  );
});
it("keeps startup recovery non-fatal when writing the recovery notice fails", async () => {
  const configPath = "/tmp/openclaw-test-missing-config.json";
  // Factory (not a shared object) so `config` and `parsed` stay distinct instances.
  const makeRestoredConfig = () => ({
    gateway: {
      mode: "local",
    },
  });

  // Arrange: the config file exists but could not be parsed.
  configState.cfg = {};
  configState.snapshot = {
    exists: true,
    valid: false,
    path: configPath,
    config: {},
    parsed: null,
    issues: [{ path: "<root>", message: "JSON5 parse failed" }],
    legacyIssues: [],
  };

  // Recovery succeeds once and installs a valid snapshot for the subsequent re-read.
  recoverConfigFromLastKnownGood.mockImplementationOnce(async () => {
    configState.snapshot = {
      exists: true,
      valid: true,
      path: configPath,
      config: makeRestoredConfig(),
      parsed: makeRestoredConfig(),
      issues: [],
      legacyIssues: [],
    };
    return true;
  });

  // Persisting the recovery notice fails for this run.
  writeRestartSentinel.mockRejectedValueOnce(new Error("disk full"));

  // Act
  await runGatewayCli(["gateway", "run"]);

  // Assert: startup still proceeded, using the port the mocked resolver falls back to.
  expect(startGatewayServer).toHaveBeenCalledWith(
    18789,
    expect.objectContaining({ bind: "loopback" }),
  );

  // Assert: the sentinel failure was logged rather than thrown.
  expect(gatewayLogMessages).toContain(
    "gateway: failed to persist config auto-recovery notice: disk full",
  );
});
it.each(["none", "trusted-proxy"] as const)("accepts --auth %s override", async (mode) => {
await runGatewayCli(["gateway", "run", "--auth", mode, "--allow-unconfigured"]);
@@ -288,7 +418,12 @@ describe("gateway run option collisions", () => {
},
},
};
configState.snapshot = { exists: true, parsed: configState.cfg };
configState.snapshot = {
exists: true,
valid: true,
config: configState.cfg,
parsed: configState.cfg,
};
await runGatewayCli(["gateway", "run", "--allow-unconfigured"]);

View File

@@ -3,14 +3,16 @@ import path from "node:path";
import type { Command } from "commander";
import { readSecretFromFile } from "../../acp/secret-file.js";
import type {
ConfigFileSnapshot,
GatewayAuthMode,
GatewayBindMode,
GatewayTailscaleMode,
} from "../../config/config.js";
import {
CONFIG_PATH,
loadConfig,
readBestEffortConfig,
readConfigFileSnapshot,
recoverConfigFromLastKnownGood,
resolveStateDir,
resolveGatewayPort,
} from "../../config/config.js";
@@ -26,6 +28,7 @@ import { isTruthyEnvValue } from "../../infra/env.js";
import { formatErrorMessage } from "../../infra/errors.js";
import { GatewayLockError } from "../../infra/gateway-lock.js";
import { formatPortDiagnostics, inspectPortUsage } from "../../infra/ports.js";
import { writeRestartSentinel } from "../../infra/restart-sentinel.js";
import { cleanStaleGatewayProcessesSync } from "../../infra/restart-stale-pids.js";
import { detectRespawnSupervisor } from "../../infra/supervisor-markers.js";
import { setConsoleSubsystemFilter, setConsoleTimestampPrefix } from "../../logging/console.js";
@@ -107,6 +110,8 @@ type Awaitable<T> = T | Promise<T>;
* restart storm that can render low-resource hosts unresponsive.
*/
const EXIT_CONFIG_ERROR = 78;
/**
 * Message stored in the restart sentinel payload after startup restores the
 * config from the last-known-good backup. Tests assert on this exact text, so
 * keep it in sync with them when changing the wording.
 */
const CONFIG_AUTO_RECOVERY_MESSAGE =
"Gateway recovered automatically after a failed config change and restored the last known good configuration.";
const GATEWAY_AUTH_MODES: readonly GatewayAuthMode[] = [
"none",
@@ -243,6 +248,54 @@ function getGatewayStartGuardErrors(params: {
];
}
/**
 * Loads the configuration used for gateway startup, attempting a single
 * last-known-good recovery when the on-disk config exists but is invalid.
 *
 * Steps, each timed through `params.startupTrace`:
 *  1. Read the best-effort config and the config file snapshot (a failed
 *     snapshot read is treated as `null`).
 *  2. If the snapshot exists and is invalid, ask
 *     `recoverConfigFromLastKnownGood` to restore it.
 *  3. On a successful restore, log a warning, try to persist a restart
 *     sentinel notice (a failure here is only logged), and re-read the snapshot.
 *
 * @param params.startupTrace Startup trace used to measure each phase.
 * @returns The snapshot that was last read (possibly `null`) and the config to
 *   start with: the snapshot's config when the snapshot is valid, otherwise
 *   the best-effort config.
 */
async function readGatewayStartupConfig(params: {
  startupTrace: ReturnType<typeof createGatewayCliStartupTrace>;
}): Promise<{ cfg: OpenClawConfig; snapshot: ConfigFileSnapshot | null }> {
  const { startupTrace } = params;
  const recoveryReason = "gateway-run-invalid-config";

  // Snapshot read failures are tolerated and surfaced as `null`.
  const readSnapshotOrNull = (): Promise<ConfigFileSnapshot | null> =>
    readConfigFileSnapshot().catch(() => null);

  const bestEffortCfg = await startupTrace.measure("cli.config-load", () =>
    readBestEffortConfig(),
  );
  const initialSnapshot: ConfigFileSnapshot | null = await startupTrace.measure(
    "cli.config-snapshot",
    () => readSnapshotOrNull(),
  );

  // A valid snapshot's config wins; otherwise fall back to the best-effort config.
  const toResult = (snapshot: ConfigFileSnapshot | null) => ({
    cfg: snapshot?.valid ? snapshot.config : bestEffortCfg,
    snapshot,
  });

  // Nothing to recover: no snapshot, no config file, or the config is already valid.
  if (!initialSnapshot?.exists || initialSnapshot.valid) {
    return toResult(initialSnapshot);
  }

  const recovered = await startupTrace.measure("cli.config-recovery", () =>
    recoverConfigFromLastKnownGood({
      snapshot: initialSnapshot,
      reason: recoveryReason,
    }),
  );
  if (!recovered) {
    return toResult(initialSnapshot);
  }

  gatewayLog.warn(
    `gateway: restored invalid effective config from last-known-good backup: ${initialSnapshot.path}`,
  );

  // The notice is best-effort: failing to write it must not block startup.
  try {
    await writeRestartSentinel({
      kind: "config-auto-recovery",
      status: "ok",
      ts: Date.now(),
      message: CONFIG_AUTO_RECOVERY_MESSAGE,
      stats: {
        mode: "config-auto-recovery",
        reason: recoveryReason,
        after: { restoredFrom: "last-known-good" },
      },
    });
  } catch (err) {
    gatewayLog.warn(
      `gateway: failed to persist config auto-recovery notice: ${formatErrorMessage(err)}`,
    );
  }

  // Re-read so callers see the restored file rather than the invalid snapshot.
  const reloadedSnapshot = await startupTrace.measure("cli.config-snapshot-reload", () =>
    readSnapshotOrNull(),
  );
  return toResult(reloadedSnapshot);
}
function resolveGatewayRunOptions(opts: GatewayRunOpts, command?: Command): GatewayRunOpts {
const resolved: GatewayRunOpts = { ...opts };
@@ -338,7 +391,7 @@ async function runGatewayCommand(opts: GatewayRunOpts) {
}
gatewayLog.info("loading configuration…");
const cfg = await startupTrace.measure("cli.config-load", () => loadConfig());
const { cfg, snapshot } = await readGatewayStartupConfig({ startupTrace });
maybeLogPendingControlUiBuild(cfg);
const portOverride = parsePort(opts.port);
if (opts.port !== undefined && portOverride === null) {
@@ -461,9 +514,6 @@ async function runGatewayCommand(opts: GatewayRunOpts) {
const tokenRaw = toOptionString(opts.token);
gatewayLog.info("resolving authentication…");
const snapshot = await startupTrace.measure("cli.config-snapshot", () =>
readConfigFileSnapshot().catch(() => null),
);
const configExists = snapshot?.exists ?? fs.existsSync(CONFIG_PATH);
const configAuditPath = path.join(resolveStateDir(process.env), "logs", "config-audit.jsonl");
const effectiveCfg = snapshot?.valid ? snapshot.config : cfg;

View File

@@ -119,6 +119,12 @@ describe("ensureConfigReady", () => {
const statusRuntime = await runEnsureConfigReady(["status"]);
expect(statusRuntime.exit).not.toHaveBeenCalled();
const bareGatewayRuntime = await runEnsureConfigReady(["gateway"]);
expect(bareGatewayRuntime.exit).not.toHaveBeenCalled();
const gatewayRunRuntime = await runEnsureConfigReady(["gateway", "run"]);
expect(gatewayRunRuntime.exit).not.toHaveBeenCalled();
const gatewayRuntime = await runEnsureConfigReady(["gateway", "health"]);
expect(gatewayRuntime.exit).not.toHaveBeenCalled();
});

View File

@@ -4,6 +4,7 @@ import { shouldMigrateStateFromPath } from "../argv.js";
const ALLOWED_INVALID_COMMANDS = new Set(["doctor", "logs", "health", "help", "status"]);
const ALLOWED_INVALID_GATEWAY_SUBCOMMANDS = new Set([
"run",
"status",
"probe",
"health",
@@ -73,9 +74,12 @@ export async function ensureConfigReady(params: {
const snapshot = preflightSnapshot ?? (await getConfigSnapshot());
const commandName = commandPath[0];
const subcommandName = commandPath[1];
const isBareGatewayForegroundRun =
commandName === "gateway" && (subcommandName === undefined || subcommandName.trim() === "");
const allowInvalid = commandName
? params.allowInvalid === true ||
ALLOWED_INVALID_COMMANDS.has(commandName) ||
isBareGatewayForegroundRun ||
(commandName === "gateway" &&
subcommandName &&
ALLOWED_INVALID_GATEWAY_SUBCOMMANDS.has(subcommandName))

View File

@@ -90,6 +90,20 @@ describe("restart sentinel", () => {
expect(formatRestartSentinelMessage(payload)).toBe("Config updated successfully");
});
it("uses the exact auto-recovery message for config recovery notices", () => {
  // The formatted output must be this text verbatim, even though `stats` is present.
  const recoveryMessage =
    "Gateway recovered automatically after a failed config change and restored the last known good configuration.";
  const payload = {
    kind: "config-auto-recovery" as const,
    status: "ok" as const,
    ts: Date.now(),
    message: recoveryMessage,
    stats: { mode: "config-auto-recovery", reason: "gateway-run-invalid-config" },
  };

  const formatted = formatRestartSentinelMessage(payload);
  expect(formatted).toBe(payload.message);

  const summary = summarizeRestartSentinel(payload);
  expect(summary).toBe("Gateway auto-recovery");
});
it("formatRestartSentinelMessage falls back to summary when no message", () => {
const payload = {
kind: "update" as const,

View File

@@ -39,7 +39,7 @@ export type RestartSentinelContinuation =
};
export type RestartSentinelPayload = {
kind: "config-apply" | "config-patch" | "update" | "restart";
kind: "config-apply" | "config-auto-recovery" | "config-patch" | "update" | "restart";
status: "ok" | "error" | "skipped";
ts: number;
sessionKey?: string;
@@ -130,7 +130,7 @@ export async function consumeRestartSentinel(
export function formatRestartSentinelMessage(payload: RestartSentinelPayload): string {
const message = payload.message?.trim();
if (message && !payload.stats) {
if (message && (!payload.stats || payload.kind === "config-auto-recovery")) {
return message;
}
const lines: string[] = [summarizeRestartSentinel(payload)];
@@ -148,6 +148,9 @@ export function formatRestartSentinelMessage(payload: RestartSentinelPayload): s
}
export function summarizeRestartSentinel(payload: RestartSentinelPayload): string {
if (payload.kind === "config-auto-recovery") {
return "Gateway auto-recovery";
}
const kind = payload.kind;
const status = payload.status;
const mode = payload.stats?.mode ? ` (${payload.stats.mode})` : "";