From ffb1628727fd5ffcca39f341daabc387010f0d00 Mon Sep 17 00:00:00 2001 From: Peter Steinberger Date: Mon, 20 Apr 2026 13:16:07 +0100 Subject: [PATCH] fix: recover invalid gateway configs --- docs/.i18n/glossary.zh-CN.json | 12 + docs/cli/config.md | 28 +++ docs/gateway/configuration.md | 24 ++ docs/gateway/troubleshooting.md | 57 +++++ docs/help/faq.md | 16 +- src/config/config.ts | 2 + src/config/io.audit.ts | 4 +- src/config/io.observe-recovery.test.ts | 85 +++++++ src/config/io.observe-recovery.ts | 173 +++++++++++++ src/config/io.ts | 73 ++++++ src/config/io.write-config.test.ts | 32 +++ src/gateway/config-recovery-notice.test.ts | 44 ++++ src/gateway/config-recovery-notice.ts | 31 +++ src/gateway/config-reload.test.ts | 228 +++++++++++++++++- src/gateway/config-reload.ts | 83 ++++++- src/gateway/server-reload-handlers.ts | 14 ++ .../server-startup-config.recovery.test.ts | 107 ++++++++ src/gateway/server-startup-config.ts | 22 ++ src/gateway/server.impl.ts | 9 + 19 files changed, 1023 insertions(+), 21 deletions(-) create mode 100644 src/gateway/config-recovery-notice.test.ts create mode 100644 src/gateway/config-recovery-notice.ts create mode 100644 src/gateway/server-startup-config.recovery.test.ts diff --git a/docs/.i18n/glossary.zh-CN.json b/docs/.i18n/glossary.zh-CN.json index abcb56f2c57..1d1989f53fc 100644 --- a/docs/.i18n/glossary.zh-CN.json +++ b/docs/.i18n/glossary.zh-CN.json @@ -374,5 +374,17 @@ { "source": "Testing", "target": "测试" + }, + { + "source": "/gateway/configuration#strict-validation", + "target": "/gateway/configuration#strict-validation" + }, + { + "source": "/gateway/configuration#config-hot-reload", + "target": "/gateway/configuration#config-hot-reload" + }, + { + "source": "/cli/config", + "target": "/cli/config" } ] diff --git a/docs/cli/config.md b/docs/cli/config.md index 6bb8f081c65..df2ebe89c3d 100644 --- a/docs/cli/config.md +++ b/docs/cli/config.md @@ -336,6 +336,34 @@ If dry-run fails: - `Dry run note: skipped exec SecretRef 
resolvability check(s)`: dry-run skipped exec refs; rerun with `--allow-exec` if you need exec resolvability validation. - For batch mode, fix failing entries and rerun `--dry-run` before writing. +## Write safety + +`openclaw config set` and other OpenClaw-owned config writers validate the full +post-change config before committing it to disk. If the new payload fails schema +validation or looks like a destructive clobber, the active config is left alone +and the rejected payload is saved beside it as `openclaw.json.rejected.*`. + +Prefer CLI writes for small edits: + +```bash +openclaw config set gateway.reload.mode hybrid --dry-run +openclaw config set gateway.reload.mode hybrid +openclaw config validate +``` + +If a write is rejected, inspect the saved payload and fix the full config shape: + +```bash +CONFIG="$(openclaw config file)" +ls -lt "$CONFIG".rejected.* 2>/dev/null | head +openclaw config validate +``` + +Direct editor writes are still allowed, but the running Gateway treats them as +untrusted until they validate. Invalid direct edits can be restored from the +last-known-good backup during startup or hot reload. See +[Gateway troubleshooting](/gateway/troubleshooting#gateway-restored-last-known-good-config). + ## Subcommands - `config file`: Print the active config file path (resolved from `OPENCLAW_CONFIG_PATH` or default location). diff --git a/docs/gateway/configuration.md b/docs/gateway/configuration.md index 9f0ea090996..7c88a628542 100644 --- a/docs/gateway/configuration.md +++ b/docs/gateway/configuration.md @@ -96,6 +96,17 @@ When validation fails: - Run `openclaw doctor` to see exact issues - Run `openclaw doctor --fix` (or `--yes`) to apply repairs +The Gateway also keeps a trusted last-known-good copy after a successful startup. 
If +`openclaw.json` is later changed outside OpenClaw and no longer validates, startup +and hot reload preserve the broken file as a timestamped `.clobbered.*` snapshot, +restore the last-known-good copy, and log a loud warning with the recovery reason. +The next main-agent turn also receives a system-event warning telling it that the +config was restored and must not be blindly rewritten. Last-known-good promotion +is updated after validated startup and after accepted hot reloads, including +OpenClaw-owned config writes whose persisted file hash still matches the accepted +write. Promotion is skipped when the candidate contains redacted secret +placeholders such as `***` or shortened token values. + ## Common tasks @@ -494,6 +505,19 @@ When validation fails: The Gateway watches `~/.openclaw/openclaw.json` and applies changes automatically — no manual restart needed for most settings. +Direct file edits are treated as untrusted until they validate. The watcher waits +for editor temp-write/rename churn to settle, reads the final file, and rejects +invalid external edits by restoring the last-known-good config. OpenClaw-owned +config writes use the same schema gate before writing; destructive clobbers such +as dropping `gateway.mode` or shrinking the file by more than half are rejected +and saved as `.rejected.*` for inspection. + +If you see `Config auto-restored from last-known-good` or +`config reload restored last-known-good config` in logs, inspect the matching +`.clobbered.*` file next to `openclaw.json`, fix the rejected payload, then run +`openclaw config validate`. See [Gateway troubleshooting](/gateway/troubleshooting#gateway-restored-last-known-good-config) +for the recovery checklist. 
+ ### Reload modes | Mode | Behavior | diff --git a/docs/gateway/troubleshooting.md b/docs/gateway/troubleshooting.md index c4c16fe5726..8819aba67d8 100644 --- a/docs/gateway/troubleshooting.md +++ b/docs/gateway/troubleshooting.md @@ -262,6 +262,63 @@ Related: - [/gateway/configuration](/gateway/configuration) - [/gateway/doctor](/gateway/doctor) +## Gateway restored last-known-good config + +Use this when the Gateway starts, but logs say it restored `openclaw.json`. + +```bash +openclaw logs --follow +openclaw config file +openclaw config validate +openclaw doctor +``` + +Look for: + +- `Config auto-restored from last-known-good` +- `gateway: invalid config was restored from last-known-good backup` +- `config reload restored last-known-good config after invalid-config` +- A timestamped `openclaw.json.clobbered.*` file beside the active config +- A main-agent system event that starts with `Config recovery warning` + +What happened: + +- The rejected config did not validate during startup or hot reload. +- OpenClaw preserved the rejected payload as `.clobbered.*`. +- The active config was restored from the last validated last-known-good copy. +- The next main-agent turn is warned not to blindly rewrite the rejected config. + +Inspect and repair: + +```bash +CONFIG="$(openclaw config file)" +ls -lt "$CONFIG".clobbered.* "$CONFIG".rejected.* 2>/dev/null | head +diff -u "$CONFIG" "$(ls -t "$CONFIG".clobbered.* 2>/dev/null | head -n 1)" +openclaw config validate +openclaw doctor +``` + +Common signatures: + +- `.clobbered.*` exists → an external direct edit or startup read was restored. +- `.rejected.*` exists → an OpenClaw-owned config write failed schema or clobber checks before commit. +- `Config write rejected:` → the write tried to drop required shape, shrink the file sharply, or persist invalid config. +- `Config last-known-good promotion skipped` → the candidate contained redacted secret placeholders such as `***`. + +Fix options: + +1. 
Keep the restored active config if it is correct. +2. Copy only the intended keys from `.clobbered.*` or `.rejected.*`, then apply them with `openclaw config set` or `config.patch`. +3. Run `openclaw config validate` before restarting. +4. If you edit by hand, keep the full JSON5 config, not just the partial object you wanted to change. + +Related: + +- [/gateway/configuration#strict-validation](/gateway/configuration#strict-validation) +- [/gateway/configuration#config-hot-reload](/gateway/configuration#config-hot-reload) +- [/cli/config](/cli/config) +- [/gateway/doctor](/gateway/doctor) + ## Gateway probe warnings Use this when `openclaw gateway probe` reaches something, but still prints a warning block. diff --git a/docs/help/faq.md b/docs/help/faq.md index f40e55b3429..6101e6c07db 100644 --- a/docs/help/faq.md +++ b/docs/help/faq.md @@ -1629,10 +1629,20 @@ for usage/billing and raise limits as needed. `config.apply` replaces the **entire config**. If you send a partial object, everything else is removed. + Current OpenClaw protects against many accidental clobbers: + + - OpenClaw-owned config writes validate the full post-change config before writing. + - Invalid or destructive OpenClaw-owned writes are rejected and saved as `openclaw.json.rejected.*`. + - If a direct edit breaks startup or hot reload, the Gateway restores the last-known-good config and saves the rejected file as `openclaw.json.clobbered.*`. + - The main agent receives a boot warning after recovery so it does not blindly write the bad config again. + Recover: - - Restore from backup (git or a copied `~/.openclaw/openclaw.json`). - - If you have no backup, re-run `openclaw doctor` and reconfigure channels/models. + - Check `openclaw logs --follow` for `Config auto-restored from last-known-good`, `Config write rejected:`, or `config reload restored last-known-good config`. + - Inspect the newest `openclaw.json.clobbered.*` or `openclaw.json.rejected.*` beside the active config. 
+ - Keep the active restored config if it works, then copy only the intended keys back with `openclaw config set` or `config.patch`. + - Run `openclaw config validate` and `openclaw doctor`. + - If you have no last-known-good or rejected payload, restore from backup, or re-run `openclaw doctor` and reconfigure channels/models. - If this was unexpected, file a bug and include your last known config or any backup. - A local coding agent can often reconstruct a working config from logs or history. @@ -1644,7 +1654,7 @@ for usage/billing and raise limits as needed. - Use `config.patch` for partial RPC edits; keep `config.apply` for full-config replacement only. - If you are using the owner-only `gateway` tool from an agent run, it will still reject writes to `tools.exec.ask` / `tools.exec.security` (including legacy `tools.bash.*` aliases that normalize to the same protected exec paths). - Docs: [Config](/cli/config), [Configure](/cli/configure), [Doctor](/gateway/doctor). + Docs: [Config](/cli/config), [Configure](/cli/configure), [Gateway troubleshooting](/gateway/troubleshooting#gateway-restored-last-known-good-config), [Doctor](/gateway/doctor). 
diff --git a/src/config/config.ts b/src/config/config.ts index f2eee8fc96d..45ebd5d56b7 100644 --- a/src/config/config.ts +++ b/src/config/config.ts @@ -12,10 +12,12 @@ export { readBestEffortConfig, readSourceConfigBestEffort, parseConfigJson5, + promoteConfigSnapshotToLastKnownGood, readConfigFileSnapshot, readConfigFileSnapshotForWrite, readSourceConfigSnapshot, readSourceConfigSnapshotForWrite, + recoverConfigFromLastKnownGood, resetConfigRuntimeState, resolveConfigSnapshotHash, setRuntimeConfigSnapshotRefreshHandler, diff --git a/src/config/io.audit.ts b/src/config/io.audit.ts index db40375d381..9bef1969fc4 100644 --- a/src/config/io.audit.ts +++ b/src/config/io.audit.ts @@ -3,7 +3,7 @@ import { resolveStateDir } from "./paths.js"; const CONFIG_AUDIT_LOG_FILENAME = "config-audit.jsonl"; -export type ConfigWriteAuditResult = "rename" | "copy-fallback" | "failed"; +export type ConfigWriteAuditResult = "rename" | "copy-fallback" | "failed" | "rejected"; export type ConfigWriteAuditRecord = { ts: string; @@ -269,7 +269,7 @@ export function finalizeConfigWriteAuditRecord(params: { uid: null, gid: null, }; - const success = params.result !== "failed"; + const success = params.result !== "failed" && params.result !== "rejected"; return { ...params.base, result: params.result, diff --git a/src/config/io.observe-recovery.test.ts b/src/config/io.observe-recovery.test.ts index 623390ed22f..cd74942d19c 100644 --- a/src/config/io.observe-recovery.test.ts +++ b/src/config/io.observe-recovery.test.ts @@ -7,8 +7,12 @@ import { afterAll, beforeAll, describe, expect, it, vi } from "vitest"; import { maybeRecoverSuspiciousConfigRead, maybeRecoverSuspiciousConfigReadSync, + promoteConfigSnapshotToLastKnownGood, + recoverConfigFromLastKnownGood, + resolveLastKnownGoodConfigPath, type ObserveRecoveryDeps, } from "./io.observe-recovery.js"; +import type { ConfigFileSnapshot } from "./types.js"; describe("config observe recovery", () => { let fixtureRoot = ""; @@ -33,6 +37,26 @@ 
describe("config observe recovery", () => { await fsp.writeFile(configPath, `${JSON.stringify(config, null, 2)}\n`, "utf-8"); } + async function makeSnapshot(configPath: string, config: Record) { + const raw = `${JSON.stringify(config, null, 2)}\n`; + await fsp.mkdir(path.dirname(configPath), { recursive: true }); + await fsp.writeFile(configPath, raw, "utf-8"); + return { + path: configPath, + exists: true, + raw, + parsed: config, + sourceConfig: config, + resolved: config, + valid: true, + runtimeConfig: config, + config, + issues: [], + warnings: [], + legacyIssues: [], + } satisfies ConfigFileSnapshot; + } + function makeDeps( home: string, warn = vi.fn(), @@ -158,4 +182,65 @@ describe("config observe recovery", () => { expect(observe?.lastKnownGoodIno ?? null).toBeNull(); }); }); + + it("promotes a valid startup config and restores it after an invalid direct edit", async () => { + await withSuiteHome(async (home) => { + const { deps, configPath, auditPath, warn } = makeDeps(home); + const snapshot = await makeSnapshot(configPath, { + gateway: { mode: "local", auth: { mode: "token", token: "secret-token" } }, + channels: { discord: { enabled: true, dmPolicy: "pairing" } }, + }); + + await expect( + promoteConfigSnapshotToLastKnownGood({ deps, snapshot, logger: deps.logger }), + ).resolves.toBe(true); + await expect(fsp.readFile(resolveLastKnownGoodConfigPath(configPath), "utf-8")).resolves.toBe( + snapshot.raw, + ); + + const brokenRaw = "{ gateway: { mode: 123 } }\n"; + await fsp.writeFile(configPath, brokenRaw, "utf-8"); + const restored = await recoverConfigFromLastKnownGood({ + deps, + snapshot: { + ...snapshot, + raw: brokenRaw, + parsed: { gateway: { mode: 123 } }, + valid: false, + issues: [{ path: "gateway.mode", message: "Expected string" }], + }, + reason: "test-invalid-config", + }); + + expect(restored).toBe(true); + await expect(fsp.readFile(configPath, "utf-8")).resolves.toBe(snapshot.raw); + expect(warn).toHaveBeenCalledWith( + 
expect.stringContaining("Config auto-restored from last-known-good:"), + ); + const lines = (await fsp.readFile(auditPath, "utf-8")).trim().split("\n").filter(Boolean); + const observe = lines + .map((line) => JSON.parse(line) as Record) + .findLast((line) => line.event === "config.observe"); + expect(observe?.restoredFromBackup).toBe(true); + expect(observe?.restoredBackupPath).toBe(resolveLastKnownGoodConfigPath(configPath)); + }); + }); + + it("refuses to promote redacted secret placeholders", async () => { + await withSuiteHome(async (home) => { + const warn = vi.fn(); + const { deps, configPath } = makeDeps(home, warn); + const snapshot = await makeSnapshot(configPath, { + gateway: { mode: "local", auth: { mode: "token", token: "***" } }, + }); + + await expect( + promoteConfigSnapshotToLastKnownGood({ deps, snapshot, logger: deps.logger }), + ).resolves.toBe(false); + await expect(fsp.stat(resolveLastKnownGoodConfigPath(configPath))).rejects.toThrow(); + expect(warn).toHaveBeenCalledWith( + expect.stringContaining("Config last-known-good promotion skipped"), + ); + }); + }); }); diff --git a/src/config/io.observe-recovery.ts b/src/config/io.observe-recovery.ts index c27488006dc..5f13a47bd51 100644 --- a/src/config/io.observe-recovery.ts +++ b/src/config/io.observe-recovery.ts @@ -7,6 +7,7 @@ import { type ConfigObserveAuditRecord, } from "./io.audit.js"; import { resolveStateDir } from "./paths.js"; +import type { ConfigFileSnapshot } from "./types.openclaw.js"; export type ObserveRecoveryDeps = { fs: { @@ -28,6 +29,7 @@ export type ObserveRecoveryDeps = { options?: { encoding?: BufferEncoding; mode?: number; flag?: string }, ): Promise; copyFile(src: string, dest: string): Promise; + chmod?(path: string, mode: number): Promise; mkdir(path: string, options?: { recursive?: boolean; mode?: number }): Promise; appendFile( path: string, @@ -55,6 +57,7 @@ export type ObserveRecoveryDeps = { options?: { encoding?: BufferEncoding; mode?: number; flag?: string }, ): 
unknown; copyFileSync(src: string, dest: string): unknown; + chmodSync?(path: string, mode: number): unknown; mkdirSync(path: string, options?: { recursive?: boolean; mode?: number }): unknown; appendFileSync( path: string, @@ -109,6 +112,7 @@ type ConfigStatMetadataSource = type ConfigHealthEntry = { lastKnownGood?: ConfigHealthFingerprint; + lastPromotedGood?: ConfigHealthFingerprint; lastObservedSuspiciousSignature?: string | null; }; @@ -506,6 +510,47 @@ function formatConfigArtifactTimestamp(ts: string): string { return ts.replaceAll(":", "-").replaceAll(".", "-"); } +export function resolveLastKnownGoodConfigPath(configPath: string): string { + return `${configPath}.last-good`; +} + +function isSensitiveConfigPath(pathLabel: string): boolean { + return /(^|\.)(api[-_]?key|auth|bearer|credential|password|private[-_]?key|secret|token)(\.|$)/i.test( + pathLabel, + ); +} + +function collectPollutedSecretPlaceholders( + value: unknown, + pathLabel = "", + output: string[] = [], +): string[] { + if (typeof value === "string") { + const trimmed = value.trim(); + if (trimmed === "***" || trimmed === "[redacted]") { + output.push(pathLabel || ""); + return output; + } + if (isSensitiveConfigPath(pathLabel) && (trimmed.includes("...") || trimmed.includes("…"))) { + output.push(pathLabel || ""); + } + return output; + } + if (Array.isArray(value)) { + value.forEach((item, index) => + collectPollutedSecretPlaceholders(item, `${pathLabel}[${index}]`, output), + ); + return output; + } + if (isRecord(value)) { + for (const [key, child] of Object.entries(value)) { + const childPath = pathLabel ? 
`${pathLabel}.${key}` : key; + collectPollutedSecretPlaceholders(child, childPath, output); + } + } + return output; +} + async function persistClobberedConfigSnapshot(params: { deps: ObserveRecoveryDeps; configPath: string; @@ -760,6 +805,7 @@ export async function observeConfigSnapshot( if (suspicious.length === 0) { if (snapshot.valid) { const nextEntry: ConfigHealthEntry = { + ...entry, lastKnownGood: current, lastObservedSuspiciousSignature: null, }; @@ -858,6 +904,7 @@ export function observeConfigSnapshotSync( if (suspicious.length === 0) { if (snapshot.valid) { healthState = setConfigHealthEntry(healthState, snapshot.path, { + ...entry, lastKnownGood: current, lastObservedSuspiciousSignature: null, }); @@ -902,3 +949,129 @@ export function observeConfigSnapshotSync( ); writeConfigHealthStateSync(deps, healthState); } + +export async function promoteConfigSnapshotToLastKnownGood(params: { + deps: ObserveRecoveryDeps; + snapshot: ConfigFileSnapshot; + logger?: Pick; +}): Promise { + const { deps, snapshot } = params; + if (!snapshot.exists || !snapshot.valid || typeof snapshot.raw !== "string") { + return false; + } + const polluted = collectPollutedSecretPlaceholders(snapshot.parsed); + if (polluted.length > 0) { + params.logger?.warn( + `Config last-known-good promotion skipped: redacted secret placeholder at ${polluted[0]}`, + ); + return false; + } + const stat = await deps.fs.promises.stat(snapshot.path).catch(() => null); + const now = new Date().toISOString(); + const current = createConfigHealthFingerprint({ + hash: resolveConfigSnapshotHash(snapshot) ?? 
hashConfigRaw(snapshot.raw), + raw: snapshot.raw, + parsed: snapshot.parsed, + gatewaySource: snapshot.resolved, + stat: stat as ConfigStatMetadataSource, + observedAt: now, + }); + const lastGoodPath = resolveLastKnownGoodConfigPath(snapshot.path); + await deps.fs.promises.writeFile(lastGoodPath, snapshot.raw, { + encoding: "utf-8", + mode: 0o600, + }); + await deps.fs.promises.chmod?.(lastGoodPath, 0o600).catch(() => {}); + const healthState = await readConfigHealthState(deps); + const entry = getConfigHealthEntry(healthState, snapshot.path); + await writeConfigHealthState( + deps, + setConfigHealthEntry(healthState, snapshot.path, { + ...entry, + lastKnownGood: current, + lastPromotedGood: current, + lastObservedSuspiciousSignature: null, + }), + ); + return true; +} + +export async function recoverConfigFromLastKnownGood(params: { + deps: ObserveRecoveryDeps; + snapshot: ConfigFileSnapshot; + reason: string; +}): Promise { + const { deps, snapshot } = params; + if (!snapshot.exists || typeof snapshot.raw !== "string") { + return false; + } + const healthState = await readConfigHealthState(deps); + const entry = getConfigHealthEntry(healthState, snapshot.path); + const promoted = entry.lastPromotedGood; + if (!promoted?.hash) { + return false; + } + const lastGoodPath = resolveLastKnownGoodConfigPath(snapshot.path); + const backupRaw = await deps.fs.promises.readFile(lastGoodPath, "utf-8").catch(() => null); + if (!backupRaw || hashConfigRaw(backupRaw) !== promoted.hash) { + return false; + } + let backupParsed: unknown; + try { + backupParsed = deps.json5.parse(backupRaw); + } catch { + return false; + } + const polluted = collectPollutedSecretPlaceholders(backupParsed); + if (polluted.length > 0) { + deps.logger.warn( + `Config last-known-good recovery skipped: redacted secret placeholder at ${polluted[0]}`, + ); + return false; + } + const now = new Date().toISOString(); + const stat = await deps.fs.promises.stat(snapshot.path).catch(() => null); + const 
current = createConfigHealthFingerprint({ + hash: resolveConfigSnapshotHash(snapshot) ?? hashConfigRaw(snapshot.raw), + raw: snapshot.raw, + parsed: snapshot.parsed, + gatewaySource: snapshot.resolved, + stat: stat as ConfigStatMetadataSource, + observedAt: now, + }); + const clobberedPath = await persistClobberedConfigSnapshot({ + deps, + configPath: snapshot.path, + raw: snapshot.raw, + observedAt: now, + }); + await deps.fs.promises.copyFile(lastGoodPath, snapshot.path); + await deps.fs.promises.chmod?.(snapshot.path, 0o600).catch(() => {}); + deps.logger.warn( + `Config auto-restored from last-known-good: ${snapshot.path} (${params.reason})`, + ); + await appendConfigAuditRecord( + createConfigObserveAuditAppendParams(deps, { + ts: now, + configPath: snapshot.path, + valid: snapshot.valid, + current, + suspicious: [params.reason], + lastKnownGood: promoted, + backup: promoted, + clobberedPath, + restoredFromBackup: true, + restoredBackupPath: lastGoodPath, + }), + ); + await writeConfigHealthState( + deps, + setConfigHealthEntry(healthState, snapshot.path, { + ...entry, + lastKnownGood: promoted, + lastPromotedGood: promoted, + lastObservedSuspiciousSignature: null, + }), + ); + return true; +} diff --git a/src/config/io.ts b/src/config/io.ts index 3a0d8f583d2..9c8647dff04 100644 --- a/src/config/io.ts +++ b/src/config/io.ts @@ -48,6 +48,8 @@ import { throwInvalidConfig } from "./io.invalid-config.js"; import { maybeRecoverSuspiciousConfigRead, maybeRecoverSuspiciousConfigReadSync, + promoteConfigSnapshotToLastKnownGood as promoteConfigSnapshotToLastKnownGoodWithDeps, + recoverConfigFromLastKnownGood as recoverConfigFromLastKnownGoodWithDeps, } from "./io.observe-recovery.js"; import { persistGeneratedOwnerDisplaySecret } from "./io.owner-display-secret.js"; import { @@ -126,6 +128,7 @@ type ConfigHealthFingerprint = { type ConfigHealthEntry = { lastKnownGood?: ConfigHealthFingerprint; + lastPromotedGood?: ConfigHealthFingerprint; 
lastObservedSuspiciousSignature?: string | null; }; @@ -160,6 +163,11 @@ export type ConfigWriteOptions = { * the post-write runtime snapshot refresh/reload tail entirely. */ skipRuntimeSnapshotRefresh?: boolean; + /** + * Allow intentionally destructive config writes, such as explicit reset flows. + * Normal writers must keep this false so clobbers are rejected before disk commit. + */ + allowDestructiveWrite?: boolean; }; export type ReadConfigFileSnapshotForWriteResult = { @@ -333,6 +341,12 @@ function resolveConfigWriteSuspiciousReasons(params: { return reasons; } +function resolveConfigWriteBlockingReasons(suspicious: string[]): string[] { + return suspicious.filter( + (reason) => reason.startsWith("size-drop:") || reason === "gateway-mode-removed", + ); +} + async function readConfigHealthState(deps: Required): Promise { try { const healthPath = resolveConfigHealthStatePath(deps.env, deps.homedir); @@ -601,6 +615,7 @@ async function observeConfigSnapshot( if (suspicious.length === 0) { if (snapshot.valid) { const nextEntry: ConfigHealthEntry = { + ...entry, lastKnownGood: current, lastObservedSuspiciousSignature: null, }; @@ -734,6 +749,7 @@ function observeConfigSnapshotSync( if (suspicious.length === 0) { if (snapshot.valid) { const nextEntry: ConfigHealthEntry = { + ...entry, lastKnownGood: current, lastObservedSuspiciousSignature: null, }; @@ -1395,6 +1411,27 @@ export function createConfigIO(overrides: ConfigIoDeps = {}) { return result.snapshot; } + async function promoteConfigSnapshotToLastKnownGood( + snapshot: ConfigFileSnapshot, + ): Promise { + return await promoteConfigSnapshotToLastKnownGoodWithDeps({ + deps, + snapshot, + logger: deps.logger, + }); + } + + async function recoverConfigFromLastKnownGood(params: { + snapshot: ConfigFileSnapshot; + reason: string; + }): Promise { + return await recoverConfigFromLastKnownGoodWithDeps({ + deps, + snapshot: params.snapshot, + reason: params.reason, + }); + } + async function 
readConfigFileSnapshotForWrite(): Promise { const result = await readConfigFileSnapshotInternal(); return { @@ -1656,6 +1693,26 @@ export function createConfigIO(overrides: ConfigIoDeps = {}) { }), }); }; + const blockingReasons = resolveConfigWriteBlockingReasons(suspiciousReasons); + if (blockingReasons.length > 0 && options.allowDestructiveWrite !== true) { + const rejectedPath = `${configPath}.rejected.${formatConfigArtifactTimestamp(new Date().toISOString())}`; + await deps.fs.promises + .writeFile(rejectedPath, json, { + encoding: "utf-8", + mode: 0o600, + flag: "wx", + }) + .catch(() => {}); + const message = `Config write rejected: ${configPath} (${blockingReasons.join(", ")}). Rejected payload saved to ${rejectedPath}.`; + const err = Object.assign(new Error(message), { + code: "CONFIG_WRITE_REJECTED", + rejectedPath, + reasons: blockingReasons, + }); + deps.logger.warn(message); + await appendWriteAudit("rejected", err); + throw err; + } const tmp = path.join( dir, @@ -1720,6 +1777,8 @@ export function createConfigIO(overrides: ConfigIoDeps = {}) { readSourceConfigBestEffort, readConfigFileSnapshot, readConfigFileSnapshotForWrite, + promoteConfigSnapshotToLastKnownGood, + recoverConfigFromLastKnownGood, writeConfigFile, }; } @@ -1820,6 +1879,19 @@ export async function readConfigFileSnapshot(): Promise { return await createConfigIO().readConfigFileSnapshot(); } +export async function promoteConfigSnapshotToLastKnownGood( + snapshot: ConfigFileSnapshot, +): Promise { + return await createConfigIO().promoteConfigSnapshotToLastKnownGood(snapshot); +} + +export async function recoverConfigFromLastKnownGood(params: { + snapshot: ConfigFileSnapshot; + reason: string; +}): Promise { + return await createConfigIO().recoverConfigFromLastKnownGood(params); +} + export async function readSourceConfigSnapshot(): Promise { return await readConfigFileSnapshot(); } @@ -1853,6 +1925,7 @@ export async function writeConfigFile( envSnapshotForRestore: 
options.envSnapshotForRestore, }), unsetPaths: options.unsetPaths, + allowDestructiveWrite: options.allowDestructiveWrite, skipRuntimeSnapshotRefresh: options.skipRuntimeSnapshotRefresh, }); if ( diff --git a/src/config/io.write-config.test.ts b/src/config/io.write-config.test.ts index 14fefab83b9..21653a3bb44 100644 --- a/src/config/io.write-config.test.ts +++ b/src/config/io.write-config.test.ts @@ -233,6 +233,38 @@ describe("config io write", () => { }); }); + it("rejects destructive internal writes before replacing the config", async () => { + await withSuiteHome(async (home) => { + const configPath = path.join(home, ".openclaw", "openclaw.json"); + await fs.mkdir(path.dirname(configPath), { recursive: true }); + const original = { + gateway: { mode: "local" }, + channels: { telegram: { enabled: true, dmPolicy: "pairing" } }, + agents: { list: [{ id: "main", default: true, workspace: "/tmp/openclaw-main" }] }, + tools: { profile: "safe" }, + commands: { ownerDisplay: "hash" }, + }; + await fs.writeFile(configPath, `${JSON.stringify(original, null, 2)}\n`, "utf-8"); + const warn = vi.fn(); + const io = createConfigIO({ + env: { VITEST: "true" } as NodeJS.ProcessEnv, + homedir: () => home, + logger: { warn, error: vi.fn() }, + }); + + await expect(io.writeConfigFile({ update: { channel: "beta" } })).rejects.toMatchObject({ + code: "CONFIG_WRITE_REJECTED", + }); + + await expect(fs.readFile(configPath, "utf-8")).resolves.toBe( + `${JSON.stringify(original, null, 2)}\n`, + ); + const entries = await fs.readdir(path.dirname(configPath)); + expect(entries.some((entry) => entry.includes(".rejected."))).toBe(true); + expect(warn).toHaveBeenCalledWith(expect.stringContaining("Config write rejected:")); + }); + }); + it("does not inject include-only $schema into the root config during partial writes", async () => { await withSuiteHome(async (home) => { const configPath = path.join(home, ".openclaw", "openclaw.json"); diff --git 
a/src/gateway/config-recovery-notice.test.ts b/src/gateway/config-recovery-notice.test.ts new file mode 100644 index 00000000000..1516aade740 --- /dev/null +++ b/src/gateway/config-recovery-notice.test.ts @@ -0,0 +1,44 @@ +import { afterEach, describe, expect, it } from "vitest"; +import { + drainSystemEvents, + peekSystemEvents, + resetSystemEventsForTest, +} from "../infra/system-events.js"; +import { + enqueueConfigRecoveryNotice, + formatConfigRecoveryNotice, +} from "./config-recovery-notice.js"; + +describe("config recovery notice", () => { + afterEach(() => { + resetSystemEventsForTest(); + }); + + it("formats a prompt-facing warning for recovered configs", () => { + expect( + formatConfigRecoveryNotice({ + phase: "startup", + reason: "startup-invalid-config", + configPath: "/home/test/.openclaw/openclaw.json", + }), + ).toBe( + "Config recovery warning: OpenClaw restored openclaw.json from the last-known-good backup during startup (startup-invalid-config). The rejected config was invalid and was preserved as a timestamped .clobbered.* file. 
Do not write openclaw.json again unless you validate the full config first.", + ); + }); + + it("queues the notice for the main agent session", () => { + expect( + enqueueConfigRecoveryNotice({ + cfg: {}, + phase: "reload", + reason: "reload-invalid-config", + configPath: "/home/test/.openclaw/openclaw.json", + }), + ).toBe(true); + + expect(peekSystemEvents("agent:main:main")).toHaveLength(1); + expect(drainSystemEvents("agent:main:main")[0]).toContain( + "Do not write openclaw.json again unless you validate the full config first.", + ); + }); +}); diff --git a/src/gateway/config-recovery-notice.ts b/src/gateway/config-recovery-notice.ts new file mode 100644 index 00000000000..ad7e45b2059 --- /dev/null +++ b/src/gateway/config-recovery-notice.ts @@ -0,0 +1,31 @@ +import path from "node:path"; +import { resolveMainSessionKey } from "../config/sessions/main-session.js"; +import type { OpenClawConfig } from "../config/types.openclaw.js"; +import { enqueueSystemEvent } from "../infra/system-events.js"; + +export type ConfigRecoveryNoticePhase = "startup" | "reload"; + +export function formatConfigRecoveryNotice(params: { + phase: ConfigRecoveryNoticePhase; + reason: string; + configPath: string; +}): string { + const configName = path.basename(params.configPath) || "openclaw.json"; + return [ + `Config recovery warning: OpenClaw restored ${configName} from the last-known-good backup during ${params.phase} (${params.reason}).`, + "The rejected config was invalid and was preserved as a timestamped .clobbered.* file.", + `Do not write ${configName} again unless you validate the full config first.`, + ].join(" "); +} + +export function enqueueConfigRecoveryNotice(params: { + cfg: OpenClawConfig; + phase: ConfigRecoveryNoticePhase; + reason: string; + configPath: string; +}): boolean { + return enqueueSystemEvent(formatConfigRecoveryNotice(params), { + sessionKey: resolveMainSessionKey(params.cfg), + contextKey: `config-recovery:${params.phase}:${params.reason}`, + }); +} 
diff --git a/src/gateway/config-reload.test.ts b/src/gateway/config-reload.test.ts index 94a426d6d20..54f2b615f56 100644 --- a/src/gateway/config-reload.test.ts +++ b/src/gateway/config-reload.test.ts @@ -368,7 +368,16 @@ function makeSnapshot(partial: Partial = {}): ConfigFileSnap function createReloaderHarness( readSnapshot: () => Promise, - options: { initialInternalWriteHash?: string | null } = {}, + options: { + initialInternalWriteHash?: string | null; + recoverSnapshot?: (snapshot: ConfigFileSnapshot, reason: string) => Promise; + promoteSnapshot?: (snapshot: ConfigFileSnapshot, reason: string) => Promise; + onRecovered?: (params: { + reason: string; + snapshot: ConfigFileSnapshot; + recoveredSnapshot: ConfigFileSnapshot; + }) => void | Promise; + } = {}, ) { const watcher = createWatcherMock(); vi.spyOn(chokidar, "watch").mockReturnValue(watcher as unknown as never); @@ -392,6 +401,9 @@ function createReloaderHarness( initialConfig: { gateway: { reload: { debounceMs: 0 } } }, initialInternalWriteHash: options.initialInternalWriteHash, readSnapshot, + recoverSnapshot: options.recoverSnapshot, + promoteSnapshot: options.promoteSnapshot, + onRecovered: options.onRecovered, subscribeToWrites, onHotReload, onRestart, @@ -515,6 +527,145 @@ describe("startGatewayConfigReloader", () => { } }); + it("restores last-known-good on invalid external config edits and reloads recovered snapshot", async () => { + const readSnapshot = vi + .fn<() => Promise>() + .mockResolvedValueOnce( + makeSnapshot({ + valid: false, + raw: "{ gateway: { mode: 123 } }", + issues: [{ path: "gateway.mode", message: "Expected string" }], + hash: "bad-1", + }), + ) + .mockResolvedValueOnce( + makeSnapshot({ + config: { + gateway: { reload: { debounceMs: 0 } }, + hooks: { enabled: true }, + }, + hash: "last-good-1", + }), + ); + const recoverSnapshot = vi.fn(async () => true); + const promoteSnapshot = vi.fn(async () => true); + const onRecovered = vi.fn(); + const { watcher, onHotReload, 
onRestart, log, reloader } = createReloaderHarness(readSnapshot, { + recoverSnapshot, + promoteSnapshot, + onRecovered, + }); + + watcher.emit("change"); + await vi.runAllTimersAsync(); + + expect(recoverSnapshot).toHaveBeenCalledWith( + expect.objectContaining({ valid: false }), + "invalid-config", + ); + expect(readSnapshot).toHaveBeenCalledTimes(2); + expect(onRecovered).toHaveBeenCalledWith( + expect.objectContaining({ + reason: "invalid-config", + snapshot: expect.objectContaining({ valid: false }), + recoveredSnapshot: expect.objectContaining({ hash: "last-good-1" }), + }), + ); + expect(onHotReload).toHaveBeenCalledTimes(1); + expect(onRestart).not.toHaveBeenCalled(); + expect(promoteSnapshot).toHaveBeenCalledWith( + expect.objectContaining({ hash: "last-good-1" }), + "valid-config", + ); + expect(log.warn).toHaveBeenCalledWith( + "config reload restored last-known-good config after invalid-config", + ); + + await reloader.stop(); + }); + + it("promotes valid external config edits after they are accepted", async () => { + const acceptedSnapshot = makeSnapshot({ + config: { + gateway: { reload: { debounceMs: 0 } }, + hooks: { enabled: true }, + }, + hash: "external-good-1", + }); + const readSnapshot = vi + .fn<() => Promise>() + .mockResolvedValueOnce(acceptedSnapshot); + const promoteSnapshot = vi.fn(async () => true); + const { watcher, onHotReload, reloader } = createReloaderHarness(readSnapshot, { + promoteSnapshot, + }); + + watcher.emit("change"); + await vi.runAllTimersAsync(); + + expect(onHotReload).toHaveBeenCalledTimes(1); + expect(promoteSnapshot).toHaveBeenCalledWith(acceptedSnapshot, "valid-config"); + + await reloader.stop(); + }); + + it("does not promote external config edits when hot reload rejects them", async () => { + const acceptedSnapshot = makeSnapshot({ + config: { + gateway: { reload: { debounceMs: 0 } }, + hooks: { enabled: true }, + }, + hash: "external-rejected-1", + }); + const readSnapshot = vi + .fn<() => Promise>() + 
.mockResolvedValueOnce(acceptedSnapshot); + const promoteSnapshot = vi.fn(async () => true); + const { watcher, onHotReload, log, reloader } = createReloaderHarness(readSnapshot, { + promoteSnapshot, + }); + onHotReload.mockRejectedValueOnce(new Error("reload refused")); + + watcher.emit("change"); + await vi.runAllTimersAsync(); + + expect(onHotReload).toHaveBeenCalledTimes(1); + expect(promoteSnapshot).not.toHaveBeenCalled(); + expect(log.error).toHaveBeenCalledWith("config reload failed: Error: reload refused"); + + await reloader.stop(); + }); + + it("keeps accepted external config reloads applied when last-known-good promotion fails", async () => { + const acceptedSnapshot = makeSnapshot({ + config: { + gateway: { reload: { debounceMs: 0 } }, + hooks: { enabled: true }, + }, + hash: "external-promotion-fails-1", + }); + const readSnapshot = vi + .fn<() => Promise>() + .mockResolvedValueOnce(acceptedSnapshot); + const promoteSnapshot = vi.fn(async () => { + throw new Error("disk full"); + }); + const { watcher, onHotReload, log, reloader } = createReloaderHarness(readSnapshot, { + promoteSnapshot, + }); + + watcher.emit("change"); + await vi.runAllTimersAsync(); + + expect(onHotReload).toHaveBeenCalledTimes(1); + expect(promoteSnapshot).toHaveBeenCalledWith(acceptedSnapshot, "valid-config"); + expect(log.warn).toHaveBeenCalledWith( + "config reload last-known-good promotion failed: Error: disk full", + ); + + await reloader.stop(); + }); + it("reuses in-process write notifications and dedupes watcher rereads by persisted hash", async () => { const readSnapshot = vi .fn<() => Promise>() @@ -534,6 +685,22 @@ describe("startGatewayConfigReloader", () => { hash: "internal-1", }), ) + .mockResolvedValueOnce( + makeSnapshot({ + sourceConfig: { + gateway: { reload: { debounceMs: 0 } }, + }, + runtimeConfig: { + gateway: { reload: { debounceMs: 0 } }, + hooks: { enabled: true }, + }, + config: { + gateway: { reload: { debounceMs: 0 } }, + hooks: { enabled: true }, + }, 
+ hash: "internal-1", + }), + ) .mockResolvedValueOnce( makeSnapshot({ sourceConfig: { @@ -548,7 +715,8 @@ describe("startGatewayConfigReloader", () => { hash: "external-1", }), ); - const harness = createReloaderHarness(readSnapshot); + const promoteSnapshot = vi.fn(async () => true); + const harness = createReloaderHarness(readSnapshot, { promoteSnapshot }); harness.emitWrite({ configPath: "/tmp/openclaw.json", @@ -562,26 +730,68 @@ describe("startGatewayConfigReloader", () => { }); await vi.runOnlyPendingTimersAsync(); - expect(readSnapshot).not.toHaveBeenCalled(); - expect(harness.onHotReload).toHaveBeenCalledTimes(1); - - harness.watcher.emit("change"); - harness.watcher.emit("change"); - await vi.runOnlyPendingTimersAsync(); - expect(readSnapshot).toHaveBeenCalledTimes(1); expect(harness.onHotReload).toHaveBeenCalledTimes(1); + expect(promoteSnapshot).toHaveBeenCalledWith( + expect.objectContaining({ hash: "internal-1" }), + "in-process-write", + ); + harness.watcher.emit("change"); harness.watcher.emit("change"); await vi.runOnlyPendingTimersAsync(); expect(readSnapshot).toHaveBeenCalledTimes(2); expect(harness.onHotReload).toHaveBeenCalledTimes(1); + + harness.watcher.emit("change"); + await vi.runOnlyPendingTimersAsync(); + + expect(readSnapshot).toHaveBeenCalledTimes(3); + expect(harness.onHotReload).toHaveBeenCalledTimes(1); expect(harness.onRestart).toHaveBeenCalledTimes(1); await harness.reloader.stop(); }); + it("skips in-process promotion when the persisted file hash no longer matches the write", async () => { + const readSnapshot = vi.fn<() => Promise>().mockResolvedValueOnce( + makeSnapshot({ + sourceConfig: { + gateway: { reload: { debounceMs: 0 }, port: 19002 }, + }, + runtimeConfig: { + gateway: { reload: { debounceMs: 0 }, port: 19002 }, + }, + config: { + gateway: { reload: { debounceMs: 0 }, port: 19002 }, + }, + hash: "racing-external-edit", + }), + ); + const promoteSnapshot = vi.fn(async () => true); + const harness = 
createReloaderHarness(readSnapshot, { promoteSnapshot }); + + harness.emitWrite({ + configPath: "/tmp/openclaw.json", + sourceConfig: { gateway: { reload: { debounceMs: 0 } } }, + runtimeConfig: { + gateway: { reload: { debounceMs: 0 } }, + hooks: { enabled: true }, + }, + persistedHash: "internal-1", + writtenAtMs: Date.now(), + }); + await vi.runOnlyPendingTimersAsync(); + + expect(harness.onHotReload).toHaveBeenCalledTimes(1); + expect(readSnapshot).toHaveBeenCalledTimes(1); + expect(promoteSnapshot).not.toHaveBeenCalled(); + expect(harness.log.warn).not.toHaveBeenCalled(); + + await harness.reloader.stop(); + }); + it("dedupes the first watcher reread for startup internal writes", async () => { const readSnapshot = vi .fn<() => Promise>() diff --git a/src/gateway/config-reload.ts b/src/gateway/config-reload.ts index f6bbd48dba2..3e236b6bb95 100644 --- a/src/gateway/config-reload.ts +++ b/src/gateway/config-reload.ts @@ -104,6 +104,13 @@ export function startGatewayConfigReloader(opts: { readSnapshot: () => Promise; onHotReload: (plan: GatewayReloadPlan, nextConfig: OpenClawConfig) => Promise; onRestart: (plan: GatewayReloadPlan, nextConfig: OpenClawConfig) => void | Promise; + recoverSnapshot?: (snapshot: ConfigFileSnapshot, reason: string) => Promise; + promoteSnapshot?: (snapshot: ConfigFileSnapshot, reason: string) => Promise; + onRecovered?: (params: { + reason: string; + snapshot: ConfigFileSnapshot; + recoveredSnapshot: ConfigFileSnapshot; + }) => void | Promise; subscribeToWrites?: (listener: (event: ConfigWriteNotification) => void) => () => void; log: { info: (msg: string) => void; @@ -120,7 +127,7 @@ export function startGatewayConfigReloader(opts: { let stopped = false; let restartQueued = false; let missingConfigRetries = 0; - let pendingInProcessConfig: OpenClawConfig | null = null; + let pendingInProcessConfig: { config: OpenClawConfig; persistedHash: string } | null = null; let lastAppliedWriteHash = opts.initialInternalWriteHash ?? 
null; const scheduleAfter = (wait: number) => { @@ -180,6 +187,32 @@ export function startGatewayConfigReloader(opts: { return true; }; + const recoverAndReadSnapshot = async ( + snapshot: ConfigFileSnapshot, + reason: string, + ): Promise => { + if (!opts.recoverSnapshot) { + return null; + } + const recovered = await opts.recoverSnapshot(snapshot, reason); + if (!recovered) { + return null; + } + opts.log.warn(`config reload restored last-known-good config after ${reason}`); + const nextSnapshot = await opts.readSnapshot(); + if (!nextSnapshot.valid) { + const issues = formatConfigIssueLines(nextSnapshot.issues, "").join(", "); + opts.log.warn(`config reload recovery snapshot is invalid: ${issues}`); + return null; + } + try { + await opts.onRecovered?.({ reason, snapshot, recoveredSnapshot: nextSnapshot }); + } catch (err) { + opts.log.warn(`config reload recovery notice failed: ${String(err)}`); + } + return nextSnapshot; + }; + const applySnapshot = async (nextConfig: OpenClawConfig) => { const changedPaths = diffConfigPaths(currentConfig, nextConfig); currentConfig = nextConfig; @@ -224,6 +257,32 @@ export function startGatewayConfigReloader(opts: { await opts.onHotReload(plan, nextConfig); }; + const promoteAcceptedSnapshot = async (snapshot: ConfigFileSnapshot, reason: string) => { + if (!opts.promoteSnapshot || !snapshot.exists || !snapshot.valid) { + return; + } + try { + await opts.promoteSnapshot(snapshot, reason); + } catch (err) { + opts.log.warn(`config reload last-known-good promotion failed: ${String(err)}`); + } + }; + + const promoteAcceptedInProcessWrite = async (persistedHash: string) => { + if (!opts.promoteSnapshot) { + return; + } + try { + const snapshot = await opts.readSnapshot(); + if (snapshot.hash !== persistedHash || !snapshot.valid) { + return; + } + await promoteAcceptedSnapshot(snapshot, "in-process-write"); + } catch (err) { + opts.log.warn(`config reload in-process last-known-good promotion failed: ${String(err)}`); + } + }; + 
const runReload = async () => { if (stopped) { return; @@ -239,13 +298,14 @@ export function startGatewayConfigReloader(opts: { } try { if (pendingInProcessConfig) { - const nextConfig = pendingInProcessConfig; + const pendingWrite = pendingInProcessConfig; pendingInProcessConfig = null; missingConfigRetries = 0; - await applySnapshot(nextConfig); + await applySnapshot(pendingWrite.config); + await promoteAcceptedInProcessWrite(pendingWrite.persistedHash); return; } - const snapshot = await opts.readSnapshot(); + let snapshot = await opts.readSnapshot(); if (lastAppliedWriteHash && typeof snapshot.hash === "string") { if (snapshot.hash === lastAppliedWriteHash) { return; @@ -255,10 +315,16 @@ export function startGatewayConfigReloader(opts: { if (handleMissingSnapshot(snapshot)) { return; } - if (handleInvalidSnapshot(snapshot)) { - return; + if (!snapshot.valid) { + const recoveredSnapshot = await recoverAndReadSnapshot(snapshot, "invalid-config"); + if (!recoveredSnapshot) { + handleInvalidSnapshot(snapshot); + return; + } + snapshot = recoveredSnapshot; } await applySnapshot(snapshot.config); + await promoteAcceptedSnapshot(snapshot, "valid-config"); } catch (err) { opts.log.error(`config reload failed: ${String(err)}`); } finally { @@ -285,7 +351,10 @@ export function startGatewayConfigReloader(opts: { if (event.configPath !== opts.watchPath) { return; } - pendingInProcessConfig = event.runtimeConfig; + pendingInProcessConfig = { + config: event.runtimeConfig, + persistedHash: event.persistedHash, + }; lastAppliedWriteHash = event.persistedHash; scheduleAfter(0); }) ?? 
(() => {}); diff --git a/src/gateway/server-reload-handlers.ts b/src/gateway/server-reload-handlers.ts index a5696c4ac47..e71663cc007 100644 --- a/src/gateway/server-reload-handlers.ts +++ b/src/gateway/server-reload-handlers.ts @@ -23,6 +23,7 @@ import { } from "../secrets/runtime.js"; import { getInspectableTaskRegistrySummary } from "../tasks/task-registry.maintenance.js"; import type { ChannelHealthMonitor } from "./channel-health-monitor.js"; +import { enqueueConfigRecoveryNotice } from "./config-recovery-notice.js"; import type { ChannelKind } from "./config-reload-plan.js"; import { startGatewayConfigReloader, type GatewayReloadPlan } from "./config-reload.js"; import { resolveHooksConfig } from "./hooks.js"; @@ -82,6 +83,8 @@ type ManagedGatewayConfigReloaderParams = Omit< initialInternalWriteHash: string | null; watchPath: string; readSnapshot: typeof import("../config/config.js").readConfigFileSnapshot; + recoverSnapshot: typeof import("../config/config.js").recoverConfigFromLastKnownGood; + promoteSnapshot: typeof import("../config/config.js").promoteConfigSnapshotToLastKnownGood; subscribeToWrites: typeof import("../config/config.js").registerConfigWriteListener; logReload: GatewayReloadLog & { error: (msg: string) => void; @@ -300,6 +303,17 @@ export function startManagedGatewayConfigReloader(params: ManagedGatewayConfigRe initialConfig: params.initialConfig, initialInternalWriteHash: params.initialInternalWriteHash, readSnapshot: params.readSnapshot, + recoverSnapshot: async (snapshot, reason) => + await params.recoverSnapshot({ snapshot, reason: `reload-${reason}` }), + promoteSnapshot: async (snapshot, _reason) => await params.promoteSnapshot(snapshot), + onRecovered: ({ reason, snapshot, recoveredSnapshot }) => { + enqueueConfigRecoveryNotice({ + cfg: recoveredSnapshot.config, + phase: "reload", + reason: `reload-${reason}`, + configPath: snapshot.path, + }); + }, subscribeToWrites: params.subscribeToWrites, onHotReload: async (plan, nextConfig) => 
{ const previousSharedGatewaySessionGeneration = diff --git a/src/gateway/server-startup-config.recovery.test.ts b/src/gateway/server-startup-config.recovery.test.ts new file mode 100644 index 00000000000..eb1cdbb0629 --- /dev/null +++ b/src/gateway/server-startup-config.recovery.test.ts @@ -0,0 +1,107 @@ +import { beforeAll, beforeEach, describe, expect, it, vi } from "vitest"; +import type { ConfigFileSnapshot, OpenClawConfig } from "../config/types.js"; +import { buildTestConfigSnapshot } from "./test-helpers.config-snapshots.js"; + +vi.mock("../config/config.js", () => ({ + applyConfigOverrides: vi.fn((config: OpenClawConfig) => config), + isNixMode: false, + readConfigFileSnapshot: vi.fn(), + recoverConfigFromLastKnownGood: vi.fn(), + writeConfigFile: vi.fn(), +})); + +vi.mock("./config-recovery-notice.js", () => ({ + enqueueConfigRecoveryNotice: vi.fn(), +})); + +let loadGatewayStartupConfigSnapshot: typeof import("./server-startup-config.js").loadGatewayStartupConfigSnapshot; +let configIo: typeof import("../config/config.js"); +let recoveryNotice: typeof import("./config-recovery-notice.js"); + +const configPath = "/tmp/openclaw-startup-recovery.json"; +const validConfig = { + gateway: { + mode: "local", + }, +} as OpenClawConfig; + +function buildSnapshot(params: { + valid: boolean; + raw: string; + config?: OpenClawConfig; +}): ConfigFileSnapshot { + return buildTestConfigSnapshot({ + path: configPath, + exists: true, + raw: params.raw, + parsed: params.config ?? null, + valid: params.valid, + config: params.config ?? ({} as OpenClawConfig), + issues: params.valid ? 
[] : [{ path: "gateway.mode", message: "Expected 'local' or 'remote'" }], + legacyIssues: [], + }); +} + +describe("gateway startup config recovery", () => { + beforeAll(async () => { + ({ loadGatewayStartupConfigSnapshot } = await import("./server-startup-config.js")); + configIo = await import("../config/config.js"); + recoveryNotice = await import("./config-recovery-notice.js"); + }); + + beforeEach(() => { + vi.clearAllMocks(); + }); + + it("restores last-known-good config before startup validation", async () => { + const invalidSnapshot = buildSnapshot({ valid: false, raw: "{ invalid json" }); + const recoveredSnapshot = buildSnapshot({ + valid: true, + raw: `${JSON.stringify(validConfig)}\n`, + config: validConfig, + }); + vi.mocked(configIo.readConfigFileSnapshot) + .mockResolvedValueOnce(invalidSnapshot) + .mockResolvedValueOnce(recoveredSnapshot); + vi.mocked(configIo.recoverConfigFromLastKnownGood).mockResolvedValueOnce(true); + const log = { info: vi.fn(), warn: vi.fn() }; + + await expect( + loadGatewayStartupConfigSnapshot({ + minimalTestGateway: true, + log, + }), + ).resolves.toBe(recoveredSnapshot); + + expect(configIo.recoverConfigFromLastKnownGood).toHaveBeenCalledWith({ + snapshot: invalidSnapshot, + reason: "startup-invalid-config", + }); + expect(log.warn).toHaveBeenCalledWith( + `gateway: invalid config was restored from last-known-good backup: ${configPath}`, + ); + expect(recoveryNotice.enqueueConfigRecoveryNotice).toHaveBeenCalledWith({ + cfg: recoveredSnapshot.config, + phase: "startup", + reason: "startup-invalid-config", + configPath, + }); + }); + + it("keeps startup validation loud when last-known-good recovery is unavailable", async () => { + const invalidSnapshot = buildSnapshot({ valid: false, raw: "{ invalid json" }); + vi.mocked(configIo.readConfigFileSnapshot).mockResolvedValueOnce(invalidSnapshot); + vi.mocked(configIo.recoverConfigFromLastKnownGood).mockResolvedValueOnce(false); + + await expect( + 
loadGatewayStartupConfigSnapshot({ + minimalTestGateway: true, + log: { info: vi.fn(), warn: vi.fn() }, + }), + ).rejects.toThrow( + `Invalid config at ${configPath}.\ngateway.mode: Expected 'local' or 'remote'\nRun "openclaw doctor --fix" to repair, then retry.`, + ); + + expect(recoveryNotice.enqueueConfigRecoveryNotice).not.toHaveBeenCalled(); + }); +}); diff --git a/src/gateway/server-startup-config.ts b/src/gateway/server-startup-config.ts index c5d9dc1717c..516b1f2d34c 100644 --- a/src/gateway/server-startup-config.ts +++ b/src/gateway/server-startup-config.ts @@ -7,6 +7,7 @@ import { applyConfigOverrides, isNixMode, readConfigFileSnapshot, + recoverConfigFromLastKnownGood, writeConfigFile, } from "../config/config.js"; import { formatConfigIssueLines } from "../config/issue-format.js"; @@ -21,6 +22,7 @@ import { prepareSecretsRuntimeSnapshot, } from "../secrets/runtime.js"; import { resolveGatewayAuth } from "./auth.js"; +import { enqueueConfigRecoveryNotice } from "./config-recovery-notice.js"; import { assertGatewayAuthNotKnownWeak } from "./known-weak-gateway-secrets.js"; import { ensureGatewayStartupAuth, @@ -60,6 +62,26 @@ export async function loadGatewayStartupConfigSnapshot(params: { ); } if (configSnapshot.exists) { + if (!configSnapshot.valid) { + const recovered = await recoverConfigFromLastKnownGood({ + snapshot: configSnapshot, + reason: "startup-invalid-config", + }); + if (recovered) { + params.log.warn( + `gateway: invalid config was restored from last-known-good backup: ${configSnapshot.path}`, + ); + configSnapshot = await readConfigFileSnapshot(); + if (configSnapshot.valid) { + enqueueConfigRecoveryNotice({ + cfg: configSnapshot.config, + phase: "startup", + reason: "startup-invalid-config", + configPath: configSnapshot.path, + }); + } + } + } assertValidGatewayStartupConfigSnapshot(configSnapshot, { includeDoctorHint: true }); } diff --git a/src/gateway/server.impl.ts b/src/gateway/server.impl.ts index 2bbc3367c0a..7611810e678 100644 --- 
a/src/gateway/server.impl.ts +++ b/src/gateway/server.impl.ts @@ -10,7 +10,9 @@ import { getRuntimeConfig, isNixMode, loadConfig, + promoteConfigSnapshotToLastKnownGood, readConfigFileSnapshot, + recoverConfigFromLastKnownGood, registerConfigWriteListener, writeConfigFile, } from "../config/config.js"; @@ -243,6 +245,7 @@ export async function startGatewayServer( let cfgAtStart: OpenClawConfig; let startupInternalWriteHash: string | null = null; + let startupLastGoodSnapshot = configSnapshot; const startupRuntimeConfig = applyConfigOverrides(configSnapshot.config); const authBootstrap = await prepareGatewayStartupConfig({ configSnapshot, @@ -294,6 +297,7 @@ export async function startGatewayServer( { const startupSnapshot = await readConfigFileSnapshot(); startupInternalWriteHash = startupSnapshot.hash ?? null; + startupLastGoodSnapshot = startupSnapshot; } const pluginBootstrap = await prepareGatewayPluginBootstrap({ cfgAtStart, @@ -782,6 +786,8 @@ export async function startGatewayServer( initialInternalWriteHash: startupInternalWriteHash, watchPath: configSnapshot.path, readSnapshot: readConfigFileSnapshot, + recoverSnapshot: recoverConfigFromLastKnownGood, + promoteSnapshot: promoteConfigSnapshotToLastKnownGood, subscribeToWrites: registerConfigWriteListener, deps, broadcast, @@ -812,6 +818,9 @@ export async function startGatewayServer( sharedGatewaySessionGenerationState, clients, }); + await promoteConfigSnapshotToLastKnownGood(startupLastGoodSnapshot).catch((err) => { + log.warn(`gateway: failed to promote config last-known-good backup: ${String(err)}`); + }); } catch (err) { await closeOnStartupFailure(); throw err;