diff --git a/src/cli/gateway-cli/run.option-collisions.test.ts b/src/cli/gateway-cli/run.option-collisions.test.ts index f1cfe6850d3..61e4e5913b4 100644 --- a/src/cli/gateway-cli/run.option-collisions.test.ts +++ b/src/cli/gateway-cli/run.option-collisions.test.ts @@ -25,6 +25,12 @@ const configState = vi.hoisted(() => ({ cfg: {} as Record, snapshot: { exists: false } as Record, })); +const recoverConfigFromLastKnownGood = vi.fn<(params?: unknown) => Promise>( + async (_params?: unknown) => false, +); +const writeRestartSentinel = vi.fn<(payload?: unknown) => Promise>( + async (_payload?: unknown) => "/tmp/restart-sentinel.json", +); const controlUiState = vi.hoisted(() => ({ root: "/tmp/openclaw-control-ui" as string | null, })); @@ -33,10 +39,11 @@ const { runtimeErrors, defaultRuntime, resetRuntimeCapture } = createCliRuntimeC vi.mock("../../config/config.js", () => ({ getConfigPath: () => "/tmp/openclaw-test-missing-config.json", - loadConfig: () => configState.cfg, + readBestEffortConfig: async () => configState.cfg, readConfigFileSnapshot: async () => configState.snapshot, + recoverConfigFromLastKnownGood: (params: unknown) => recoverConfigFromLastKnownGood(params), resolveStateDir: () => "/tmp", - resolveGatewayPort: () => 18789, + resolveGatewayPort: (cfg?: { gateway?: { port?: number } }) => cfg?.gateway?.port ?? 18789, })); vi.mock("../../gateway/auth.js", () => ({ @@ -90,6 +97,10 @@ vi.mock("../../infra/ports.js", () => ({ inspectPortUsage: async () => ({ status: "free" }), })); +vi.mock("../../infra/restart-sentinel.js", () => ({ + writeRestartSentinel: (payload: unknown) => writeRestartSentinel(payload), +})); + vi.mock("../../logging/console.js", () => ({ setConsoleSubsystemFilter: (filters: string[]) => setConsoleSubsystemFilter(filters), setConsoleTimestampPrefix: () => undefined, @@ -100,7 +111,9 @@ vi.mock("../../logging/subsystem.js", () => ({ info: (message: string) => { gatewayLogMessages.push(message); }, - warn: () => undefined, + warn: (message: string) => { + gatewayLogMessages.push(message); + }, error: () => undefined, }), })); @@ -144,6 +157,10 @@ describe("gateway run option collisions", () => { configState.snapshot = { exists: false }; controlUiState.root = "/tmp/openclaw-control-ui"; gatewayLogMessages.length = 0; + recoverConfigFromLastKnownGood.mockReset(); + recoverConfigFromLastKnownGood.mockResolvedValue(false); + writeRestartSentinel.mockReset(); + writeRestartSentinel.mockResolvedValue("/tmp/restart-sentinel.json"); startGatewayServer.mockClear(); setGatewayWsLogStyle.mockClear(); setVerbose.mockClear(); @@ -258,6 +275,119 @@ describe("gateway run option collisions", () => { expect(startGatewayServer).not.toHaveBeenCalled(); }); + it("restores last-known-good config before startup when the effective config is invalid", async () => { + configState.cfg = {}; + configState.snapshot = { + exists: true, + valid: false, + path: "/tmp/openclaw-test-missing-config.json", + config: {}, + parsed: null, + issues: [{ path: "", message: "JSON5 parse failed" }], + legacyIssues: [], + }; + recoverConfigFromLastKnownGood.mockImplementationOnce(async () => { + configState.snapshot = { + exists: true, + valid: true, + path: "/tmp/openclaw-test-missing-config.json", + config: { + gateway: { + mode: "local", + port: 19170, + auth: { mode: "none" }, + }, + }, + parsed: { + gateway: { + mode: "local", + port: 19170, + auth: { mode: "none" }, + }, + }, + issues: [], + legacyIssues: [], + }; + return true; + }); + + await runGatewayCli(["gateway", "run", "--allow-unconfigured"]); + + expect(recoverConfigFromLastKnownGood).toHaveBeenCalledWith({ + snapshot: expect.objectContaining({ + exists: true, + valid: false, + }), + reason: "gateway-run-invalid-config", + }); + expect(writeRestartSentinel).toHaveBeenCalledWith({ + kind: "config-auto-recovery", + status: "ok", + ts: expect.any(Number), + message: + "Gateway recovered automatically after a failed config change and restored the last known good configuration.", + stats: { + mode: "config-auto-recovery", + reason: "gateway-run-invalid-config", + after: { restoredFrom: "last-known-good" }, + }, + }); + expect(gatewayLogMessages).toContain( + "gateway: restored invalid effective config from last-known-good backup: /tmp/openclaw-test-missing-config.json", + ); + expect(startGatewayServer).toHaveBeenCalledWith( + 19170, + expect.objectContaining({ + bind: "loopback", + auth: undefined, + }), + ); + }); + + it("keeps startup recovery non-fatal when writing the recovery notice fails", async () => { + configState.cfg = {}; + configState.snapshot = { + exists: true, + valid: false, + path: "/tmp/openclaw-test-missing-config.json", + config: {}, + parsed: null, + issues: [{ path: "", message: "JSON5 parse failed" }], + legacyIssues: [], + }; + recoverConfigFromLastKnownGood.mockImplementationOnce(async () => { + configState.snapshot = { + exists: true, + valid: true, + path: "/tmp/openclaw-test-missing-config.json", + config: { + gateway: { + mode: "local", + }, + }, + parsed: { + gateway: { + mode: "local", + }, + }, + issues: [], + legacyIssues: [], + }; + return true; + }); + writeRestartSentinel.mockRejectedValueOnce(new Error("disk full")); + + await runGatewayCli(["gateway", "run"]); + + expect(startGatewayServer).toHaveBeenCalledWith( + 18789, + expect.objectContaining({ bind: "loopback" }), + ); + expect(gatewayLogMessages).toContain( + "gateway: failed to persist config auto-recovery notice: disk full", + ); + }); + it.each(["none", "trusted-proxy"] as const)("accepts --auth %s override", async (mode) => { await runGatewayCli(["gateway", "run", "--auth", mode, "--allow-unconfigured"]); @@ -288,7 +418,12 @@ describe("gateway run option collisions", () => { }, }, }; - configState.snapshot = { exists: true, parsed: configState.cfg }; + configState.snapshot = { + exists: true, + valid: true, + config: configState.cfg, + parsed: configState.cfg, + }; await runGatewayCli(["gateway", "run", "--allow-unconfigured"]); diff --git a/src/cli/gateway-cli/run.ts b/src/cli/gateway-cli/run.ts index 7341fa66534..3801b829b5c 100644 --- a/src/cli/gateway-cli/run.ts +++ b/src/cli/gateway-cli/run.ts @@ -3,14 +3,16 @@ import path from "node:path"; import type { Command } from "commander"; import { readSecretFromFile } from "../../acp/secret-file.js"; import type { + ConfigFileSnapshot, GatewayAuthMode, GatewayBindMode, GatewayTailscaleMode, } from "../../config/config.js"; import { CONFIG_PATH, - loadConfig, + readBestEffortConfig, readConfigFileSnapshot, + recoverConfigFromLastKnownGood, resolveStateDir, resolveGatewayPort, } from "../../config/config.js"; @@ -26,6 +28,7 @@ import { isTruthyEnvValue } from "../../infra/env.js"; import { formatErrorMessage } from "../../infra/errors.js"; import { GatewayLockError } from "../../infra/gateway-lock.js"; import { formatPortDiagnostics, inspectPortUsage } from "../../infra/ports.js"; +import { writeRestartSentinel } from "../../infra/restart-sentinel.js"; import { cleanStaleGatewayProcessesSync } from "../../infra/restart-stale-pids.js"; import { detectRespawnSupervisor } from "../../infra/supervisor-markers.js"; import { setConsoleSubsystemFilter, setConsoleTimestampPrefix } from "../../logging/console.js"; @@ -107,6 +110,8 @@ type Awaitable = T | Promise; * restart storm that can render low-resource hosts unresponsive. */ const EXIT_CONFIG_ERROR = 78; +const CONFIG_AUTO_RECOVERY_MESSAGE = + "Gateway recovered automatically after a failed config change and restored the last known good configuration."; const GATEWAY_AUTH_MODES: readonly GatewayAuthMode[] = [ "none", @@ -243,6 +248,54 @@ function getGatewayStartGuardErrors(params: { ]; } +async function readGatewayStartupConfig(params: { + startupTrace: ReturnType; +}): Promise<{ cfg: OpenClawConfig; snapshot: ConfigFileSnapshot | null }> { + let cfg = await params.startupTrace.measure("cli.config-load", () => readBestEffortConfig()); + let snapshot: ConfigFileSnapshot | null = await params.startupTrace.measure( + "cli.config-snapshot", + () => readConfigFileSnapshot().catch(() => null), + ); + if (snapshot?.exists && !snapshot.valid) { + const invalidSnapshot = snapshot; + const recovered = await params.startupTrace.measure("cli.config-recovery", () => + recoverConfigFromLastKnownGood({ + snapshot: invalidSnapshot, + reason: "gateway-run-invalid-config", + }), + ); + if (recovered) { + gatewayLog.warn( + `gateway: restored invalid effective config from last-known-good backup: ${invalidSnapshot.path}`, + ); + try { + await writeRestartSentinel({ + kind: "config-auto-recovery", + status: "ok", + ts: Date.now(), + message: CONFIG_AUTO_RECOVERY_MESSAGE, + stats: { + mode: "config-auto-recovery", + reason: "gateway-run-invalid-config", + after: { restoredFrom: "last-known-good" }, + }, + }); + } catch (err) { + gatewayLog.warn( + `gateway: failed to persist config auto-recovery notice: ${formatErrorMessage(err)}`, + ); + } + snapshot = await params.startupTrace.measure("cli.config-snapshot-reload", () => + readConfigFileSnapshot().catch(() => null), + ); + } + } + if (snapshot?.valid) { + cfg = snapshot.config; + } + return { cfg, snapshot }; +} + function resolveGatewayRunOptions(opts: GatewayRunOpts, command?: Command): GatewayRunOpts { const resolved: GatewayRunOpts = { ...opts }; @@ -338,7 +391,7 @@ async function runGatewayCommand(opts: GatewayRunOpts) { } gatewayLog.info("loading configuration…"); - const cfg = await startupTrace.measure("cli.config-load", () => loadConfig()); + const { cfg, snapshot } = await readGatewayStartupConfig({ startupTrace }); maybeLogPendingControlUiBuild(cfg); const portOverride = parsePort(opts.port); if (opts.port !== undefined && portOverride === null) { @@ -461,9 +514,6 @@ async function runGatewayCommand(opts: GatewayRunOpts) { const tokenRaw = toOptionString(opts.token); gatewayLog.info("resolving authentication…"); - const snapshot = await startupTrace.measure("cli.config-snapshot", () => - readConfigFileSnapshot().catch(() => null), - ); const configExists = snapshot?.exists ?? fs.existsSync(CONFIG_PATH); const configAuditPath = path.join(resolveStateDir(process.env), "logs", "config-audit.jsonl"); const effectiveCfg = snapshot?.valid ? snapshot.config : cfg; diff --git a/src/cli/program/config-guard.test.ts b/src/cli/program/config-guard.test.ts index 3838381ad1e..aa0b92cbf74 100644 --- a/src/cli/program/config-guard.test.ts +++ b/src/cli/program/config-guard.test.ts @@ -119,6 +119,12 @@ describe("ensureConfigReady", () => { const statusRuntime = await runEnsureConfigReady(["status"]); expect(statusRuntime.exit).not.toHaveBeenCalled(); + const bareGatewayRuntime = await runEnsureConfigReady(["gateway"]); + expect(bareGatewayRuntime.exit).not.toHaveBeenCalled(); + + const gatewayRunRuntime = await runEnsureConfigReady(["gateway", "run"]); + expect(gatewayRunRuntime.exit).not.toHaveBeenCalled(); + const gatewayRuntime = await runEnsureConfigReady(["gateway", "health"]); expect(gatewayRuntime.exit).not.toHaveBeenCalled(); }); diff --git a/src/cli/program/config-guard.ts b/src/cli/program/config-guard.ts index ee79d308f51..87451e58cef 100644 --- a/src/cli/program/config-guard.ts +++ b/src/cli/program/config-guard.ts @@ -4,6 +4,7 @@ import { shouldMigrateStateFromPath } from "../argv.js"; const ALLOWED_INVALID_COMMANDS = new Set(["doctor", "logs", "health", "help", "status"]); const ALLOWED_INVALID_GATEWAY_SUBCOMMANDS = new Set([ + "run", "status", "probe", "health", @@ -73,9 +74,12 @@ export async function ensureConfigReady(params: { const snapshot = preflightSnapshot ?? (await getConfigSnapshot()); const commandName = commandPath[0]; const subcommandName = commandPath[1]; + const isBareGatewayForegroundRun = + commandName === "gateway" && (subcommandName === undefined || subcommandName.trim() === ""); const allowInvalid = commandName ? params.allowInvalid === true || ALLOWED_INVALID_COMMANDS.has(commandName) || + isBareGatewayForegroundRun || (commandName === "gateway" && subcommandName && ALLOWED_INVALID_GATEWAY_SUBCOMMANDS.has(subcommandName)) diff --git a/src/infra/restart-sentinel.test.ts b/src/infra/restart-sentinel.test.ts index aca1d7e6668..ee7f3915ef8 100644 --- a/src/infra/restart-sentinel.test.ts +++ b/src/infra/restart-sentinel.test.ts @@ -90,6 +90,20 @@ describe("restart sentinel", () => { expect(formatRestartSentinelMessage(payload)).toBe("Config updated successfully"); }); + it("uses the exact auto-recovery message for config recovery notices", () => { + const payload = { + kind: "config-auto-recovery" as const, + status: "ok" as const, + ts: Date.now(), + message: + "Gateway recovered automatically after a failed config change and restored the last known good configuration.", + stats: { mode: "config-auto-recovery", reason: "gateway-run-invalid-config" }, + }; + + expect(formatRestartSentinelMessage(payload)).toBe(payload.message); + expect(summarizeRestartSentinel(payload)).toBe("Gateway auto-recovery"); + }); + it("formatRestartSentinelMessage falls back to summary when no message", () => { const payload = { kind: "update" as const, diff --git a/src/infra/restart-sentinel.ts b/src/infra/restart-sentinel.ts index 4fa816e016e..3faef4cada5 100644 --- a/src/infra/restart-sentinel.ts +++ b/src/infra/restart-sentinel.ts @@ -39,7 +39,7 @@ export type RestartSentinelContinuation = }; export type RestartSentinelPayload = { - kind: "config-apply" | "config-patch" | "update" | "restart"; + kind: "config-apply" | "config-auto-recovery" | "config-patch" | "update" | "restart"; status: "ok" | "error" | "skipped"; ts: number; sessionKey?: string; @@ -130,7 +130,7 @@ export async function consumeRestartSentinel( export function formatRestartSentinelMessage(payload: RestartSentinelPayload): string { const message = payload.message?.trim(); - if (message && !payload.stats) { + if (message && (!payload.stats || payload.kind === "config-auto-recovery")) { return message; } const lines: string[] = [summarizeRestartSentinel(payload)]; @@ -148,6 +148,9 @@ export function formatRestartSentinelMessage(payload: RestartSentinelPayload): s } export function summarizeRestartSentinel(payload: RestartSentinelPayload): string { + if (payload.kind === "config-auto-recovery") { + return "Gateway auto-recovery"; + } const kind = payload.kind; const status = payload.status; const mode = payload.stats?.mode ? ` (${payload.stats.mode})` : "";