mirror of
https://github.com/openclaw/openclaw.git
synced 2026-05-06 13:20:43 +00:00
fix(gateway): recover invalid config before startup
This commit is contained in:
@@ -25,6 +25,12 @@ const configState = vi.hoisted(() => ({
|
||||
cfg: {} as Record<string, unknown>,
|
||||
snapshot: { exists: false } as Record<string, unknown>,
|
||||
}));
|
||||
/** Spy standing in for config recovery; resolves to `false` unless a test overrides it. */
const recoverConfigFromLastKnownGood = vi.fn<(params?: unknown) => Promise<boolean>>(
  (_params?: unknown) => Promise.resolve(false),
);
|
||||
/** Spy standing in for the restart-sentinel writer; resolves to a fixed sentinel path by default. */
const writeRestartSentinel = vi.fn<(payload?: unknown) => Promise<string>>(
  (_payload?: unknown) => Promise.resolve("/tmp/restart-sentinel.json"),
);
|
||||
/** Mutable control-UI root shared with mock factories; individual tests may set `root` to null. */
const controlUiState = vi.hoisted(() => {
  return {
    root: "/tmp/openclaw-control-ui" as string | null,
  };
});
|
||||
@@ -33,10 +39,11 @@ const { runtimeErrors, defaultRuntime, resetRuntimeCapture } = createCliRuntimeC
|
||||
|
||||
// Stub of the config module used by the gateway run tests. Reads are served from
// `configState`, and recovery is routed to the `recoverConfigFromLastKnownGood` spy.
vi.mock("../../config/config.js", () => ({
  getConfigPath: () => "/tmp/openclaw-test-missing-config.json",
  loadConfig: () => configState.cfg,
  readBestEffortConfig: async () => configState.cfg,
  readConfigFileSnapshot: async () => configState.snapshot,
  // Wrapped in an arrow so the spy is looked up when called, not when the factory runs.
  recoverConfigFromLastKnownGood: (params: unknown) => recoverConfigFromLastKnownGood(params),
  resolveStateDir: () => "/tmp",
  // Single definition only: the stale fixed-port stub was a duplicate key that this
  // entry silently overrode. Uses the configured gateway port, defaulting to 18789.
  resolveGatewayPort: (cfg?: { gateway?: { port?: number } }) => cfg?.gateway?.port ?? 18789,
}));
|
||||
|
||||
vi.mock("../../gateway/auth.js", () => ({
|
||||
@@ -90,6 +97,10 @@ vi.mock("../../infra/ports.js", () => ({
|
||||
inspectPortUsage: async () => ({ status: "free" }),
|
||||
}));
|
||||
|
||||
// Replace the restart-sentinel module so writes go to the `writeRestartSentinel` spy.
vi.mock("../../infra/restart-sentinel.js", () => {
  return {
    writeRestartSentinel: (payload: unknown) => writeRestartSentinel(payload),
  };
});
|
||||
|
||||
vi.mock("../../logging/console.js", () => ({
|
||||
setConsoleSubsystemFilter: (filters: string[]) => setConsoleSubsystemFilter(filters),
|
||||
setConsoleTimestampPrefix: () => undefined,
|
||||
@@ -100,7 +111,9 @@ vi.mock("../../logging/subsystem.js", () => ({
|
||||
info: (message: string) => {
|
||||
gatewayLogMessages.push(message);
|
||||
},
|
||||
warn: () => undefined,
|
||||
warn: (message: string) => {
|
||||
gatewayLogMessages.push(message);
|
||||
},
|
||||
error: () => undefined,
|
||||
}),
|
||||
}));
|
||||
@@ -144,6 +157,10 @@ describe("gateway run option collisions", () => {
|
||||
configState.snapshot = { exists: false };
|
||||
controlUiState.root = "/tmp/openclaw-control-ui";
|
||||
gatewayLogMessages.length = 0;
|
||||
recoverConfigFromLastKnownGood.mockReset();
|
||||
recoverConfigFromLastKnownGood.mockResolvedValue(false);
|
||||
writeRestartSentinel.mockReset();
|
||||
writeRestartSentinel.mockResolvedValue("/tmp/restart-sentinel.json");
|
||||
startGatewayServer.mockClear();
|
||||
setGatewayWsLogStyle.mockClear();
|
||||
setVerbose.mockClear();
|
||||
@@ -258,6 +275,119 @@ describe("gateway run option collisions", () => {
|
||||
expect(startGatewayServer).not.toHaveBeenCalled();
|
||||
});
|
||||
|
||||
it("restores last-known-good config before startup when the effective config is invalid", async () => {
  // Snapshot the CLI sees first: the file exists but failed to parse.
  const brokenSnapshot = {
    exists: true,
    valid: false,
    path: "/tmp/openclaw-test-missing-config.json",
    config: {},
    parsed: null,
    issues: [{ path: "<root>", message: "JSON5 parse failed" }],
    legacyIssues: [],
  };
  // Built per call so `config` and `parsed` remain separate objects.
  const buildRestoredConfig = () => ({
    gateway: {
      mode: "local",
      port: 19170,
      auth: { mode: "none" },
    },
  });
  // Snapshot the CLI sees once recovery has run.
  const restoredSnapshot = {
    exists: true,
    valid: true,
    path: "/tmp/openclaw-test-missing-config.json",
    config: buildRestoredConfig(),
    parsed: buildRestoredConfig(),
    issues: [],
    legacyIssues: [],
  };

  configState.cfg = {};
  configState.snapshot = brokenSnapshot;
  recoverConfigFromLastKnownGood.mockImplementationOnce(async () => {
    configState.snapshot = restoredSnapshot;
    return true;
  });

  await runGatewayCli(["gateway", "run", "--allow-unconfigured"]);

  // Recovery is asked to repair the invalid snapshot, tagged with the gateway-run reason.
  const expectedRecoveryArgs = {
    snapshot: expect.objectContaining({
      exists: true,
      valid: false,
    }),
    reason: "gateway-run-invalid-config",
  };
  expect(recoverConfigFromLastKnownGood).toHaveBeenCalledWith(expectedRecoveryArgs);

  // A restart sentinel describing the auto-recovery is persisted.
  const expectedSentinel = {
    kind: "config-auto-recovery",
    status: "ok",
    ts: expect.any(Number),
    message:
      "Gateway recovered automatically after a failed config change and restored the last known good configuration.",
    stats: {
      mode: "config-auto-recovery",
      reason: "gateway-run-invalid-config",
      after: { restoredFrom: "last-known-good" },
    },
  };
  expect(writeRestartSentinel).toHaveBeenCalledWith(expectedSentinel);

  expect(gatewayLogMessages).toContain(
    "gateway: restored invalid effective config from last-known-good backup: /tmp/openclaw-test-missing-config.json",
  );

  // The gateway starts on the port taken from the restored config.
  expect(startGatewayServer).toHaveBeenCalledWith(
    19170,
    expect.objectContaining({
      bind: "loopback",
      auth: undefined,
    }),
  );
});
|
||||
|
||||
it("keeps startup recovery non-fatal when writing the recovery notice fails", async () => {
  // Snapshot the CLI sees first: the file exists but failed to parse.
  const brokenSnapshot = {
    exists: true,
    valid: false,
    path: "/tmp/openclaw-test-missing-config.json",
    config: {},
    parsed: null,
    issues: [{ path: "<root>", message: "JSON5 parse failed" }],
    legacyIssues: [],
  };
  // Snapshot the CLI sees once recovery has run; no port is set.
  const restoredSnapshot = {
    exists: true,
    valid: true,
    path: "/tmp/openclaw-test-missing-config.json",
    config: { gateway: { mode: "local" } },
    parsed: { gateway: { mode: "local" } },
    issues: [],
    legacyIssues: [],
  };

  configState.cfg = {};
  configState.snapshot = brokenSnapshot;
  recoverConfigFromLastKnownGood.mockImplementationOnce(async () => {
    configState.snapshot = restoredSnapshot;
    return true;
  });
  // Make persisting the recovery notice fail once.
  writeRestartSentinel.mockRejectedValueOnce(new Error("disk full"));

  await runGatewayCli(["gateway", "run"]);

  // Startup still proceeds, on the mocked default port.
  expect(startGatewayServer).toHaveBeenCalledWith(
    18789,
    expect.objectContaining({ bind: "loopback" }),
  );
  // The sentinel failure shows up only as a logged message.
  expect(gatewayLogMessages).toContain(
    "gateway: failed to persist config auto-recovery notice: disk full",
  );
});
|
||||
|
||||
it.each(["none", "trusted-proxy"] as const)("accepts --auth %s override", async (mode) => {
|
||||
await runGatewayCli(["gateway", "run", "--auth", mode, "--allow-unconfigured"]);
|
||||
|
||||
@@ -288,7 +418,12 @@ describe("gateway run option collisions", () => {
|
||||
},
|
||||
},
|
||||
};
|
||||
configState.snapshot = { exists: true, parsed: configState.cfg };
|
||||
configState.snapshot = {
|
||||
exists: true,
|
||||
valid: true,
|
||||
config: configState.cfg,
|
||||
parsed: configState.cfg,
|
||||
};
|
||||
|
||||
await runGatewayCli(["gateway", "run", "--allow-unconfigured"]);
|
||||
|
||||
|
||||
@@ -3,14 +3,16 @@ import path from "node:path";
|
||||
import type { Command } from "commander";
|
||||
import { readSecretFromFile } from "../../acp/secret-file.js";
|
||||
import type {
|
||||
ConfigFileSnapshot,
|
||||
GatewayAuthMode,
|
||||
GatewayBindMode,
|
||||
GatewayTailscaleMode,
|
||||
} from "../../config/config.js";
|
||||
import {
|
||||
CONFIG_PATH,
|
||||
loadConfig,
|
||||
readBestEffortConfig,
|
||||
readConfigFileSnapshot,
|
||||
recoverConfigFromLastKnownGood,
|
||||
resolveStateDir,
|
||||
resolveGatewayPort,
|
||||
} from "../../config/config.js";
|
||||
@@ -26,6 +28,7 @@ import { isTruthyEnvValue } from "../../infra/env.js";
|
||||
import { formatErrorMessage } from "../../infra/errors.js";
|
||||
import { GatewayLockError } from "../../infra/gateway-lock.js";
|
||||
import { formatPortDiagnostics, inspectPortUsage } from "../../infra/ports.js";
|
||||
import { writeRestartSentinel } from "../../infra/restart-sentinel.js";
|
||||
import { cleanStaleGatewayProcessesSync } from "../../infra/restart-stale-pids.js";
|
||||
import { detectRespawnSupervisor } from "../../infra/supervisor-markers.js";
|
||||
import { setConsoleSubsystemFilter, setConsoleTimestampPrefix } from "../../logging/console.js";
|
||||
@@ -107,6 +110,8 @@ type Awaitable<T> = T | Promise<T>;
|
||||
* restart storm that can render low-resource hosts unresponsive.
|
||||
*/
|
||||
const EXIT_CONFIG_ERROR = 78;
|
||||
/** Message stored on the restart sentinel written after a startup config auto-recovery. */
const CONFIG_AUTO_RECOVERY_MESSAGE =
  "Gateway recovered automatically after a failed config change and restored the last known good configuration.";
|
||||
|
||||
const GATEWAY_AUTH_MODES: readonly GatewayAuthMode[] = [
|
||||
"none",
|
||||
@@ -243,6 +248,54 @@ function getGatewayStartGuardErrors(params: {
|
||||
];
|
||||
}
|
||||
|
||||
/**
 * Reads the config the gateway should start with, attempting a last-known-good
 * recovery when the on-disk config exists but is invalid.
 *
 * Steps, each timed through `params.startupTrace.measure`:
 * 1. Read the best-effort config and a config file snapshot (a snapshot read
 *    failure yields `null`).
 * 2. If the snapshot exists and is invalid, call `recoverConfigFromLastKnownGood`.
 * 3. If recovery reports success, log a warning, try to write a
 *    "config-auto-recovery" restart sentinel (a failure is only logged), and
 *    re-read the snapshot.
 *
 * @param params.startupTrace Startup tracer used to measure each step.
 * @returns The final snapshot (possibly `null`) and the config to use: the
 *   snapshot's config when the final snapshot is valid, otherwise the
 *   best-effort config read in step 1.
 */
async function readGatewayStartupConfig(params: {
  startupTrace: ReturnType<typeof createGatewayCliStartupTrace>;
}): Promise<{ cfg: OpenClawConfig; snapshot: ConfigFileSnapshot | null }> {
  const { startupTrace } = params;
  const recoveryReason = "gateway-run-invalid-config";
  // Snapshot reads are best-effort: any failure is reported as "no snapshot".
  const readSnapshotOrNull = (): Promise<ConfigFileSnapshot | null> =>
    readConfigFileSnapshot().catch(() => null);

  const bestEffortCfg = await startupTrace.measure("cli.config-load", () =>
    readBestEffortConfig(),
  );
  const initialSnapshot = await startupTrace.measure("cli.config-snapshot", () =>
    readSnapshotOrNull(),
  );

  // Nothing to recover: no snapshot, no config file, or the config is already valid.
  if (!initialSnapshot?.exists || initialSnapshot.valid) {
    return {
      cfg: initialSnapshot?.valid ? initialSnapshot.config : bestEffortCfg,
      snapshot: initialSnapshot,
    };
  }

  const recovered = await startupTrace.measure("cli.config-recovery", () =>
    recoverConfigFromLastKnownGood({
      snapshot: initialSnapshot,
      reason: recoveryReason,
    }),
  );
  if (!recovered) {
    // Recovery did not happen; hand back the invalid snapshot unchanged.
    return { cfg: bestEffortCfg, snapshot: initialSnapshot };
  }

  gatewayLog.warn(
    `gateway: restored invalid effective config from last-known-good backup: ${initialSnapshot.path}`,
  );

  // Persisting the notice must not block startup, so a write failure is logged and ignored.
  try {
    await writeRestartSentinel({
      kind: "config-auto-recovery",
      status: "ok",
      ts: Date.now(),
      message: CONFIG_AUTO_RECOVERY_MESSAGE,
      stats: {
        mode: "config-auto-recovery",
        reason: recoveryReason,
        after: { restoredFrom: "last-known-good" },
      },
    });
  } catch (err) {
    gatewayLog.warn(
      `gateway: failed to persist config auto-recovery notice: ${formatErrorMessage(err)}`,
    );
  }

  // Re-read so the caller sees the state of the config file after recovery.
  const reloadedSnapshot = await startupTrace.measure("cli.config-snapshot-reload", () =>
    readSnapshotOrNull(),
  );
  return {
    cfg: reloadedSnapshot?.valid ? reloadedSnapshot.config : bestEffortCfg,
    snapshot: reloadedSnapshot,
  };
}
|
||||
|
||||
function resolveGatewayRunOptions(opts: GatewayRunOpts, command?: Command): GatewayRunOpts {
|
||||
const resolved: GatewayRunOpts = { ...opts };
|
||||
|
||||
@@ -338,7 +391,7 @@ async function runGatewayCommand(opts: GatewayRunOpts) {
|
||||
}
|
||||
|
||||
gatewayLog.info("loading configuration…");
|
||||
const cfg = await startupTrace.measure("cli.config-load", () => loadConfig());
|
||||
const { cfg, snapshot } = await readGatewayStartupConfig({ startupTrace });
|
||||
maybeLogPendingControlUiBuild(cfg);
|
||||
const portOverride = parsePort(opts.port);
|
||||
if (opts.port !== undefined && portOverride === null) {
|
||||
@@ -461,9 +514,6 @@ async function runGatewayCommand(opts: GatewayRunOpts) {
|
||||
const tokenRaw = toOptionString(opts.token);
|
||||
|
||||
gatewayLog.info("resolving authentication…");
|
||||
const snapshot = await startupTrace.measure("cli.config-snapshot", () =>
|
||||
readConfigFileSnapshot().catch(() => null),
|
||||
);
|
||||
const configExists = snapshot?.exists ?? fs.existsSync(CONFIG_PATH);
|
||||
const configAuditPath = path.join(resolveStateDir(process.env), "logs", "config-audit.jsonl");
|
||||
const effectiveCfg = snapshot?.valid ? snapshot.config : cfg;
|
||||
|
||||
@@ -119,6 +119,12 @@ describe("ensureConfigReady", () => {
|
||||
const statusRuntime = await runEnsureConfigReady(["status"]);
|
||||
expect(statusRuntime.exit).not.toHaveBeenCalled();
|
||||
|
||||
const bareGatewayRuntime = await runEnsureConfigReady(["gateway"]);
|
||||
expect(bareGatewayRuntime.exit).not.toHaveBeenCalled();
|
||||
|
||||
const gatewayRunRuntime = await runEnsureConfigReady(["gateway", "run"]);
|
||||
expect(gatewayRunRuntime.exit).not.toHaveBeenCalled();
|
||||
|
||||
const gatewayRuntime = await runEnsureConfigReady(["gateway", "health"]);
|
||||
expect(gatewayRuntime.exit).not.toHaveBeenCalled();
|
||||
});
|
||||
|
||||
@@ -4,6 +4,7 @@ import { shouldMigrateStateFromPath } from "../argv.js";
|
||||
|
||||
/** Top-level command names for which `ensureConfigReady` sets `allowInvalid`. */
const ALLOWED_INVALID_COMMANDS = new Set([
  "doctor",
  "logs",
  "health",
  "help",
  "status",
]);
|
||||
const ALLOWED_INVALID_GATEWAY_SUBCOMMANDS = new Set([
|
||||
"run",
|
||||
"status",
|
||||
"probe",
|
||||
"health",
|
||||
@@ -73,9 +74,12 @@ export async function ensureConfigReady(params: {
|
||||
const snapshot = preflightSnapshot ?? (await getConfigSnapshot());
|
||||
const commandName = commandPath[0];
|
||||
const subcommandName = commandPath[1];
|
||||
const isBareGatewayForegroundRun =
|
||||
commandName === "gateway" && (subcommandName === undefined || subcommandName.trim() === "");
|
||||
const allowInvalid = commandName
|
||||
? params.allowInvalid === true ||
|
||||
ALLOWED_INVALID_COMMANDS.has(commandName) ||
|
||||
isBareGatewayForegroundRun ||
|
||||
(commandName === "gateway" &&
|
||||
subcommandName &&
|
||||
ALLOWED_INVALID_GATEWAY_SUBCOMMANDS.has(subcommandName))
|
||||
|
||||
@@ -90,6 +90,20 @@ describe("restart sentinel", () => {
|
||||
expect(formatRestartSentinelMessage(payload)).toBe("Config updated successfully");
|
||||
});
|
||||
|
||||
it("uses the exact auto-recovery message for config recovery notices", () => {
  const recoveryNotice =
    "Gateway recovered automatically after a failed config change and restored the last known good configuration.";
  // `stats` is present on purpose: the message must still be returned as-is for this kind.
  const payload = {
    kind: "config-auto-recovery" as const,
    status: "ok" as const,
    ts: Date.now(),
    message: recoveryNotice,
    stats: { mode: "config-auto-recovery", reason: "gateway-run-invalid-config" },
  };

  const formatted = formatRestartSentinelMessage(payload);
  expect(formatted).toBe(payload.message);

  const summary = summarizeRestartSentinel(payload);
  expect(summary).toBe("Gateway auto-recovery");
});
|
||||
|
||||
it("formatRestartSentinelMessage falls back to summary when no message", () => {
|
||||
const payload = {
|
||||
kind: "update" as const,
|
||||
|
||||
@@ -39,7 +39,7 @@ export type RestartSentinelContinuation =
|
||||
};
|
||||
|
||||
export type RestartSentinelPayload = {
|
||||
kind: "config-apply" | "config-patch" | "update" | "restart";
|
||||
kind: "config-apply" | "config-auto-recovery" | "config-patch" | "update" | "restart";
|
||||
status: "ok" | "error" | "skipped";
|
||||
ts: number;
|
||||
sessionKey?: string;
|
||||
@@ -130,7 +130,7 @@ export async function consumeRestartSentinel(
|
||||
|
||||
export function formatRestartSentinelMessage(payload: RestartSentinelPayload): string {
|
||||
const message = payload.message?.trim();
|
||||
if (message && !payload.stats) {
|
||||
if (message && (!payload.stats || payload.kind === "config-auto-recovery")) {
|
||||
return message;
|
||||
}
|
||||
const lines: string[] = [summarizeRestartSentinel(payload)];
|
||||
@@ -148,6 +148,9 @@ export function formatRestartSentinelMessage(payload: RestartSentinelPayload): s
|
||||
}
|
||||
|
||||
export function summarizeRestartSentinel(payload: RestartSentinelPayload): string {
|
||||
if (payload.kind === "config-auto-recovery") {
|
||||
return "Gateway auto-recovery";
|
||||
}
|
||||
const kind = payload.kind;
|
||||
const status = payload.status;
|
||||
const mode = payload.stats?.mode ? ` (${payload.stats.mode})` : "";
|
||||
|
||||
Reference in New Issue
Block a user