From 9eaca28ef715a3546da393b19053250ee8ff5aff Mon Sep 17 00:00:00 2001 From: Abner Shang <75654486+abnershang@users.noreply.github.com> Date: Sat, 9 May 2026 17:11:43 +0800 Subject: [PATCH] fix(backup): retry tar EOF races and skip live volatile files Fixes #72249.\n\nSummary:\n- retry live backup tar EOF races\n- skip current live session, cron, log, and delivery-queue state files\n- preserve workspace lock/temp files and keep backup --json parseable\n\nVerification:\n- Crabbox pre-fix repro: tbx_01kr5xt9vf5pas5ee4aefrp3am\n- Crabbox post-fix proof: tbx_01kr5y3e1kbtt6chbypfdydbgs\n- pnpm check:test-types\n- pnpm lint:core\n- pnpm test src/commands/backup.test.ts src/infra/backup-volatile-filter.test.ts src/infra/backup-create.test.ts\n- CI on 37664570c73d0cd17723c96d4a5c07fee8bcb058: green\n\nThanks @abnershang. --- CHANGELOG.md | 1 + docs/cli/backup.md | 2 + src/commands/backup.test.ts | 44 +++++ src/commands/backup.ts | 5 +- src/infra/backup-create.test.ts | 214 +++++++++++++++++++++++ src/infra/backup-create.ts | 169 ++++++++++++++++-- src/infra/backup-volatile-filter.test.ts | 120 +++++++++++++ src/infra/backup-volatile-filter.ts | 131 ++++++++++++++ 8 files changed, 668 insertions(+), 18 deletions(-) create mode 100644 src/infra/backup-volatile-filter.test.ts create mode 100644 src/infra/backup-volatile-filter.ts diff --git a/CHANGELOG.md b/CHANGELOG.md index 244e90be316..090118d015f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -146,6 +146,7 @@ Docs: https://docs.openclaw.ai ### Fixes - Docs/Subagents: correct the listed sub-agent bootstrap context files to include `SOUL.md`, `IDENTITY.md`, and `USER.md`. (#79470) Thanks @lastguru-net. +- Backup: keep live backup archives from copying current agent session transcripts, cron run logs, and delivery queues while preserving workspace lock/temp files and keeping `--json` output parseable when volatile files are skipped. Fixes #72249. (#72251) Thanks @abnershang. - OpenAI/Codex: install the Codex runtime plugin from npm during OpenAI onboarding and load it automatically for implicit OpenAI model routes, while preserving manual PI runtime overrides. Fixes #79358. - OpenAI/realtime voice: defer `response.create` while a realtime response is still active, retry after `response.done`/`response.cancelled`, and align GA input transcription/noise-reduction defaults with the Codex realtime reference so Discord/Voice Call consult results can resume speaking instead of tripping the active-response race. - OpenAI/realtime voice: avoid duplicate barge-in cancellation requests, log realtime model interruption/cutoff events in Discord voice logs, and treat OpenAI's no-active-response cancellation reply as a completed cancel so Discord voice sessions do not wedge pending speech after fast interruptions. diff --git a/docs/cli/backup.md b/docs/cli/backup.md index 8e7e583030a..1d50fc1b136 100644 --- a/docs/cli/backup.md +++ b/docs/cli/backup.md @@ -53,6 +53,8 @@ skipped. The archive payload stores file contents from those source trees, and the embedded `manifest.json` records the resolved absolute source paths plus the archive layout used for each asset. +During archive creation, OpenClaw skips known live-mutation files that do not have restoration value, including active agent session transcripts, cron run logs, rolling logs, delivery queues, socket/pid/temp files under the state directory, and related durable-queue temp files. The JSON result includes `skippedVolatileCount` so automation can see how many files were intentionally omitted. + Installed plugin source and manifest files under the state directory's `extensions/` tree are included, but their nested `node_modules/` dependency trees are skipped. Those dependencies are rebuildable install artifacts; after diff --git a/src/commands/backup.test.ts b/src/commands/backup.test.ts index 4e288d33a6d..49e843a6927 100644 --- a/src/commands/backup.test.ts +++ b/src/commands/backup.test.ts @@ -316,6 +316,50 @@ describe("backup commands", () => { } }); + it("keeps volatile-skip notices out of json output", async () => { + const stateDir = path.join(tempHome.home, ".openclaw"); + const backupDir = await fs.mkdtemp(path.join(os.tmpdir(), "openclaw-backups-json-")); + try { + const runtime = createBackupTestRuntime(); + await mockStateOnlyBackupPlan(stateDir); + tarCreateMock.mockImplementationOnce( + async ( + options: { file: string; filter?: (entryPath: string) => boolean }, + entryPaths: string[], + ) => { + const manifestPath = entryPaths[0]; + const stateRoot = entryPaths[1]; + expect(manifestPath).toBeDefined(); + expect(stateRoot).toBeDefined(); + if (!manifestPath || !stateRoot) { + throw new Error("backup test expected manifest and state entries"); + } + expect(options.filter?.(manifestPath)).toBe(true); + expect( + options.filter?.(path.join(stateRoot, "agents", "main", "sessions", "s.jsonl")), + ).toBe(false); + await fs.writeFile(options.file, "archive-bytes", "utf8"); + }, + ); + + const result = await backupCreateCommand(runtime, { + output: backupDir, + json: true, + }); + + expect(result.skippedVolatileCount).toBe(1); + expect(runtime.log).toHaveBeenCalledTimes(1); + const payload = vi.mocked(runtime.log).mock.calls[0]?.[0]; + if (typeof payload !== "string") { + throw new Error("backup test expected JSON string output"); + } + expect(payload).not.toContain("Backup skipped"); + expect(JSON.parse(payload)).toMatchObject({ skippedVolatileCount: 1 }); + } finally { + await fs.rm(backupDir, { recursive: true, force: true }); + } + }); + it("rejects output paths that would be created inside a backed-up directory", async () => { const stateDir = path.join(tempHome.home, ".openclaw"); await fs.writeFile(path.join(stateDir, "openclaw.json"), JSON.stringify({}), "utf8"); diff --git a/src/commands/backup.ts b/src/commands/backup.ts index 1b3362a1676..ded6edbe2c2 100644 --- a/src/commands/backup.ts +++ b/src/commands/backup.ts @@ -22,7 +22,10 @@ export async function backupCreateCommand( runtime: RuntimeEnv, opts: BackupCreateOptions = {}, ): Promise { - const result = await createBackupArchive(opts); + const result = await createBackupArchive({ + ...opts, + log: opts.log ?? (opts.json ? undefined : (message: string) => runtime.log(message)), + }); if (opts.verify && !opts.dryRun) { const { backupVerifyCommand } = await loadBackupVerifyRuntime(); await backupVerifyCommand( diff --git a/src/infra/backup-create.test.ts b/src/infra/backup-create.test.ts index 66dcc739980..53e3fafee53 100644 --- a/src/infra/backup-create.test.ts +++ b/src/infra/backup-create.test.ts @@ -6,6 +6,7 @@ import { backupVerifyCommand } from "../commands/backup-verify.js"; import type { RuntimeEnv } from "../runtime.js"; import { withOpenClawTestState } from "../test-utils/openclaw-test-state.js"; import { + __test as backupCreateInternals, buildExtensionsNodeModulesFilter, createBackupArchive, formatBackupCreateSummary, @@ -23,6 +24,7 @@ function makeResult(overrides: Partial = {}): BackupCreateRe verified: false, assets: [], skipped: [], + skippedVolatileCount: 0, ...overrides, }; } @@ -106,6 +108,159 @@ describe("formatBackupCreateSummary", () => { ])("$name", ({ result, expected }) => { expect(formatBackupCreateSummary(result)).toEqual(expected); }); + + it("surfaces the volatile skip count in the summary", () => { + expect( + formatBackupCreateSummary( + makeResult({ + assets: [ + { + kind: "state", + sourcePath: "/state", + archivePath: "archive/state", + displayPath: "~/.openclaw", + }, + ], + skippedVolatileCount: 3, + }), + ), + ).toEqual([ + "Backup archive: /tmp/openclaw-backup.tar.gz", + "Included 1 path:", + "- state: ~/.openclaw", + "Created /tmp/openclaw-backup.tar.gz", + "Skipped 3 volatile files (live sessions, cron logs, queues, sockets, pid/tmp).", + ]); + }); +}); + +describe("isTarEofRaceError", () => { + const { isTarEofRaceError } = backupCreateInternals; + + it.each([ + "did not encounter expected EOF", + "encountered unexpected EOF", + "TAR_BAD_ARCHIVE: Unrecognized archive format", + "Truncated input (needed 512 more bytes, only 0 available) (TAR_BAD_ARCHIVE)", + ])("matches tar-specific EOF-class error: %s", (message) => { + expect(isTarEofRaceError(new Error(message))).toBe(true); + }); + + it("matches errors by code even when the message is empty", () => { + expect(isTarEofRaceError(Object.assign(new Error(""), { code: "EOF" }))).toBe(true); + }); + + it.each([ + "EOF occurred in violation of protocol", + "unexpected eof while reading", + "ran out of EOF markers", + "permission denied", + "", + ])("does not match unrelated errors: %s", (message) => { + expect(isTarEofRaceError(new Error(message))).toBe(false); + }); + + it("rejects non-object inputs", () => { + expect(isTarEofRaceError(null)).toBe(false); + expect(isTarEofRaceError(undefined)).toBe(false); + expect(isTarEofRaceError("did not encounter expected EOF")).toBe(false); + }); +}); + +describe("writeTarArchiveWithRetry", () => { + it("retries on EOF-class errors and eventually succeeds", async () => { + const eofErr = Object.assign(new Error("did not encounter expected EOF"), { + path: "/state/sessions/s-abc/transcript.jsonl", + }); + const runTar = vi + .fn<() => Promise>() + .mockRejectedValueOnce(eofErr) + .mockRejectedValueOnce(eofErr) + .mockResolvedValueOnce(undefined); + const log = vi.fn(); + const sleep = vi.fn<(ms: number) => Promise>().mockResolvedValue(undefined); + + await backupCreateInternals.writeTarArchiveWithRetry({ + tempArchivePath: "/tmp/backup.tar.gz.tmp", + runTar, + log, + sleepMs: sleep, + }); + + expect(runTar).toHaveBeenCalledTimes(3); + expect(sleep).toHaveBeenNthCalledWith(1, 10_000); + expect(sleep).toHaveBeenNthCalledWith(2, 20_000); + expect(log).toHaveBeenCalledTimes(2); + }); + + it("surfaces the offending path and attempt count after exhausting retries", async () => { + const eofErr = Object.assign(new Error("did not encounter expected EOF"), { + path: "/state/logs/gateway.jsonl", + }); + const runTar = vi.fn<() => Promise>().mockRejectedValue(eofErr); + const sleep = vi.fn<(ms: number) => Promise>().mockResolvedValue(undefined); + + await expect( + backupCreateInternals.writeTarArchiveWithRetry({ + tempArchivePath: "/tmp/backup.tar.gz.tmp", + runTar, + sleepMs: sleep, + }), + ).rejects.toThrow(/last offending path: \/state\/logs\/gateway\.jsonl, after 3 attempts/); + expect(runTar).toHaveBeenCalledTimes(3); + }); + + it("lets callers reset per-attempt counters so retries report the final attempt's count, not a running sum", async () => { + // Simulate the caller's pattern: a closure counter populated by a filter + // that tar.c invokes while walking the tree. Each attempt re-walks the + // same tree, so the runTar closure must reset the counter before calling + // tar.c -- otherwise the reported count accumulates across attempts. + let skippedVolatileCount = 0; + const volatileFilesSeenPerAttempt = 5; + let attempt = 0; + + const eofErr = Object.assign(new Error("did not encounter expected EOF"), { + path: "/state/sessions/s-abc/transcript.jsonl", + }); + + const runTar = vi.fn<() => Promise>().mockImplementation(async () => { + attempt += 1; + skippedVolatileCount = 0; + for (let i = 0; i < volatileFilesSeenPerAttempt; i += 1) { + skippedVolatileCount += 1; + } + if (attempt < 3) { + throw eofErr; + } + }); + const sleep = vi.fn<(ms: number) => Promise>().mockResolvedValue(undefined); + + await backupCreateInternals.writeTarArchiveWithRetry({ + tempArchivePath: "/tmp/backup.tar.gz.tmp", + runTar, + sleepMs: sleep, + }); + + expect(runTar).toHaveBeenCalledTimes(3); + // Without the reset, this would be 15 (5 * 3 attempts). With the reset, + // it equals the count from the final (successful) attempt. + expect(skippedVolatileCount).toBe(volatileFilesSeenPerAttempt); + }); + + it("does not retry on non-EOF errors", async () => { + const runTar = vi.fn<() => Promise>().mockRejectedValue(new Error("permission denied")); + const sleep = vi.fn<(ms: number) => Promise>().mockResolvedValue(undefined); + + await expect( + backupCreateInternals.writeTarArchiveWithRetry({ + tempArchivePath: "/tmp/backup.tar.gz.tmp", + runTar, + sleepMs: sleep, + }), + ).rejects.toThrow(/permission denied/); + expect(runTar).toHaveBeenCalledTimes(1); + expect(sleep).not.toHaveBeenCalled(); + }); }); describe("buildExtensionsNodeModulesFilter", () => { @@ -131,6 +286,65 @@ describe("buildExtensionsNodeModulesFilter", () => { }); describe("createBackupArchive", () => { + it("skips current live volatile state files while preserving workspace locks", async () => { + await withOpenClawTestState( + { + layout: "split", + prefix: "openclaw-backup-volatile-", + scenario: "minimal", + }, + async (state) => { + const outputDir = state.path("backups"); + await state.writeConfig({ + agents: { + list: [{ id: "main", default: true, workspace: state.workspaceDir }], + }, + }); + await fs.mkdir(outputDir, { recursive: true }); + await fs.writeFile(path.join(state.workspaceDir, "Cargo.lock"), "workspace lock\n", "utf8"); + await fs.writeFile( + path.join(state.workspaceDir, "pending.tmp"), + "workspace temp fixture\n", + "utf8", + ); + await state.writeText("agents/main/sessions/live-session.jsonl", "session\n"); + await state.writeText("sessions/legacy-session.jsonl", "legacy session\n"); + await state.writeText("cron/runs/nightly.jsonl", "cron\n"); + await state.writeText("logs/gateway.log", "log\n"); + await state.writeJson("delivery-queue/message.json", { id: "delivery" }); + await state.writeJson("session-delivery-queue/message.json", { id: "session-delivery" }); + await state.writeText("tmp/staged.tmp", "tmp\n"); + await state.writeText("gateway.pid", "123\n"); + + const result = await createBackupArchive({ + output: outputDir, + includeWorkspace: true, + nowMs: Date.UTC(2026, 4, 9, 8, 0, 0), + }); + const entries = await listArchiveEntries(result.archivePath); + + expect(entries.some((entry) => entry.endsWith("/workspace/Cargo.lock"))).toBe(true); + expect(entries.some((entry) => entry.endsWith("/workspace/pending.tmp"))).toBe(true); + for (const suffix of [ + "/state/agents/main/sessions/live-session.jsonl", + "/state/sessions/legacy-session.jsonl", + "/state/cron/runs/nightly.jsonl", + "/state/logs/gateway.log", + "/state/delivery-queue/message.json", + "/state/session-delivery-queue/message.json", + "/state/tmp/staged.tmp", + "/state/gateway.pid", + ]) { + expect( + entries.some((entry) => entry.endsWith(suffix)), + suffix, + ).toBe(false); + } + expect(result.skippedVolatileCount).toBe(8); + }, + ); + }); + it("omits installed plugin node_modules from the real archive while keeping plugin files", async () => { await withOpenClawTestState( { diff --git a/src/infra/backup-create.ts b/src/infra/backup-create.ts index 92bb158d1a4..70b9094497c 100644 --- a/src/infra/backup-create.ts +++ b/src/infra/backup-create.ts @@ -13,6 +13,7 @@ import { import { isPathWithin } from "../commands/cleanup-utils.js"; import { resolveHomeDir, resolveUserPath } from "../utils.js"; import { resolveRuntimeServiceVersion } from "../version.js"; +import { isVolatileBackupPath } from "./backup-volatile-filter.js"; import { writeJson } from "./json-files.js"; type TarRuntime = typeof import("tar"); @@ -32,6 +33,12 @@ export type BackupCreateOptions = { verify?: boolean; json?: boolean; nowMs?: number; + /** + * Optional info logger invoked for non-fatal backup events such as tar + * retry notices or volatile-file skip counts. When omitted, events are + * silent aside from the final result. + */ + log?: (message: string) => void; }; type BackupManifestAsset = { @@ -82,8 +89,91 @@ export type BackupCreateResult = { reason: string; coveredBy?: string; }>; + /** + * Count of files the archiver actively skipped because they matched the + * known-volatile filter (live sessions, cron logs, queues, sockets, pid/tmp). + * Populated on real writes only; dry runs report 0. + */ + skippedVolatileCount: number; }; +const BACKUP_TAR_MAX_ATTEMPTS = 3; +// Backoff between attempts: wait 10s before attempt 2, 20s before attempt 3. +const BACKUP_TAR_BACKOFF_MS = [10_000, 20_000]; + +function isTarEofRaceError(err: unknown): boolean { + if (!err || typeof err !== "object") { + return false; + } + const code = (err as NodeJS.ErrnoException).code; + if (code === "EOF") { + return true; + } + // Keep this regex narrow: match only the two tar-specific EOF-class error + // strings thrown by node-tar's WriteEntry#onread (grow and shrink races, + // see node_modules/tar/dist/commonjs/write-entry.js around the + // "did not encounter expected EOF" and "encountered unexpected EOF" + // Object.assign sites), plus the TAR_BAD_ARCHIVE code surfaced by the + // parser on truncated input. A bare /EOF/i alternative also matched + // unrelated SSL/OpenSSL strings like "EOF occurred in violation of + // protocol" and "unexpected eof while reading", causing pointless retries. + const message = (err as Error).message ?? ""; + return /(did not encounter expected|encountered unexpected) EOF|TAR_BAD_ARCHIVE/i.test(message); +} + +function sleep(ms: number): Promise { + return new Promise((resolve) => setTimeout(resolve, ms)); +} + +export type BackupTarRetryLogger = (message: string) => void; + +async function writeTarArchiveWithRetry(params: { + tempArchivePath: string; + runTar: () => Promise; + log?: BackupTarRetryLogger; + sleepMs?: (ms: number) => Promise; +}): Promise { + const sleepFn = params.sleepMs ?? sleep; + let lastErr: unknown; + for (let attempt = 1; attempt <= BACKUP_TAR_MAX_ATTEMPTS; attempt += 1) { + try { + await params.runTar(); + return; + } catch (err) { + lastErr = err; + if (!isTarEofRaceError(err) || attempt === BACKUP_TAR_MAX_ATTEMPTS) { + break; + } + try { + await fs.rm(params.tempArchivePath, { force: true }); + } catch (cleanupErr) { + const code = (cleanupErr as NodeJS.ErrnoException).code; + if (code && code !== "ENOENT") { + params.log?.( + `Backup archiver could not remove temp archive ${params.tempArchivePath} between retries: ${code}. Continuing.`, + ); + } + } + const backoff = BACKUP_TAR_BACKOFF_MS[attempt - 1] ?? 0; + const offendingPath = (err as NodeJS.ErrnoException).path; + params.log?.( + `Backup archiver hit a live-write race${ + offendingPath ? ` on ${offendingPath}` : "" + } (attempt ${attempt}/${BACKUP_TAR_MAX_ATTEMPTS}); retrying in ${Math.round(backoff / 1000)}s.`, + ); + await sleepFn(backoff); + } + } + const final = lastErr instanceof Error ? lastErr : new Error(String(lastErr)); + const offendingPath = (lastErr as NodeJS.ErrnoException | undefined)?.path; + const suffix = offendingPath + ? ` (last offending path: ${offendingPath}, after ${BACKUP_TAR_MAX_ATTEMPTS} attempts)` + : ` (after ${BACKUP_TAR_MAX_ATTEMPTS} attempts)`; + throw new Error(`Backup archive write failed: ${final.message}${suffix}`, { cause: final }); +} + +export const __test = { writeTarArchiveWithRetry, isTarEofRaceError }; + async function resolveOutputPath(params: { output?: string; nowMs: number; @@ -259,6 +349,13 @@ export function formatBackupCreateSummary(result: BackupCreateResult): string[] lines.push("Dry run only; archive was not written."); } else { lines.push(`Created ${result.archivePath}`); + if (result.skippedVolatileCount > 0) { + lines.push( + `Skipped ${result.skippedVolatileCount} volatile file${ + result.skippedVolatileCount === 1 ? "" : "s" + } (live sessions, cron logs, queues, sockets, pid/tmp).`, + ); + } if (result.verified) { lines.push("Archive verification: passed"); } @@ -344,6 +441,7 @@ export async function createBackupArchive( verified: false, assets: plan.included, skipped: plan.skipped, + skippedVolatileCount: 0, }; if (opts.dryRun) { @@ -371,24 +469,61 @@ export async function createBackupArchive( const tar = await loadTarRuntime(); const stateAsset = result.assets.find((asset) => asset.kind === "state"); - const filter = stateAsset ? buildExtensionsNodeModulesFilter(stateAsset.sourcePath) : undefined; - await tar.c( - { - file: tempArchivePath, - ...(filter ? { filter } : {}), - gzip: true, - portable: true, - preservePaths: true, - onWriteEntry: (entry) => { - entry.path = remapArchiveEntryPath({ - entryPath: entry.path, - manifestPath, - archiveRoot, - }); - }, + const extensionsFilter = stateAsset + ? buildExtensionsNodeModulesFilter(stateAsset.sourcePath) + : undefined; + const volatilePlan = { stateDirs: [stateAsset?.sourcePath ?? plan.stateDir] }; + let skippedVolatileCount = 0; + const tarFilter = (entryPath: string): boolean => { + // The manifest is staged in a tmp dir outside any state directory and + // is always safe to include. + if (path.resolve(entryPath) === manifestPath) { + return true; + } + if (extensionsFilter && !extensionsFilter(entryPath)) { + return false; + } + if (isVolatileBackupPath(entryPath, volatilePlan)) { + skippedVolatileCount += 1; + return false; + } + return true; + }; + await writeTarArchiveWithRetry({ + tempArchivePath, + log: opts.log, + runTar: () => { + // tar.c re-walks the tree (and thus re-invokes tarFilter) on every + // attempt, so reset the closure counter here or retries would report + // cumulative skip counts across attempts instead of the final one. + skippedVolatileCount = 0; + return tar.c( + { + file: tempArchivePath, + gzip: true, + portable: true, + preservePaths: true, + filter: tarFilter, + onWriteEntry: (entry) => { + entry.path = remapArchiveEntryPath({ + entryPath: entry.path, + manifestPath, + archiveRoot, + }); + }, + }, + [manifestPath, ...result.assets.map((asset) => asset.sourcePath)], + ); }, - [manifestPath, ...result.assets.map((asset) => asset.sourcePath)], - ); + }); + result.skippedVolatileCount = skippedVolatileCount; + if (skippedVolatileCount > 0) { + opts.log?.( + `Backup skipped ${skippedVolatileCount} volatile file${ + skippedVolatileCount === 1 ? "" : "s" + } (live sessions, cron logs, queues, sockets, pid/tmp).`, + ); + } await publishTempArchive({ tempArchivePath, outputPath }); } finally { await fs.rm(tempArchivePath, { force: true }).catch(() => undefined); diff --git a/src/infra/backup-volatile-filter.test.ts b/src/infra/backup-volatile-filter.test.ts new file mode 100644 index 00000000000..ff70794eabf --- /dev/null +++ b/src/infra/backup-volatile-filter.test.ts @@ -0,0 +1,120 @@ +import { describe, expect, it } from "vitest"; +import { isVolatileBackupPath } from "./backup-volatile-filter.js"; + +const stateDir = "/opt/openclaw/state"; +const plan = { stateDirs: [stateDir] }; + +describe("isVolatileBackupPath", () => { + it.each([ + // volatile: session transcripts + [`${stateDir}/sessions/s-abc/transcript.jsonl`, true], + [`${stateDir}/sessions/s-abc/run.log`, true], + [`${stateDir}/agents/main/sessions/transcript.jsonl`, true], + [`${stateDir}/agents/ops/sessions/run.log`, true], + // volatile: cron run logs + [`${stateDir}/cron/runs/2026-01-01/job.log`, true], + [`${stateDir}/cron/runs/nightly.jsonl`, true], + // volatile: generic state logs + [`${stateDir}/logs/gateway.jsonl`, true], + [`${stateDir}/logs/nested/gateway.log`, true], + // volatile: sockets/pids/tmp under state + [`${stateDir}/ipc/gateway.sock`, true], + [`${stateDir}/gateway.pid`, true], + [`${stateDir}/tmp/pending.tmp`, true], + [`${stateDir}/delivery-queue/pending.tmp`, true], + [`${stateDir}/session-delivery-queue/pending.tmp`, true], + + // non-volatile: session config, not jsonl/log + [`${stateDir}/sessions/s-abc/meta.json`, false], + [`${stateDir}/agents/main/sessions/sessions.json`, false], + // non-volatile: cron definitions + [`${stateDir}/cron/jobs.json`, false], + // non-volatile: cron runs but wrong extension + [`${stateDir}/cron/runs/2026-01-01/job.json`, false], + // non-volatile: plain config + [`${stateDir}/config.json`, false], + // non-volatile: workspace files outside state + ["/home/user/project/README.md", false], + ["/home/user/project/Cargo.lock", false], + ["/home/user/project/pending.tmp", false], + // non-volatile: log-like name outside scope + ["/home/user/notes/daily.log", false], + ])("classifies %s as volatile=%s", (p, expected) => { + expect(isVolatileBackupPath(p, plan)).toBe(expected); + }); + + it("returns false when no state dirs are provided", () => { + expect( + isVolatileBackupPath(`${stateDir}/sessions/s-abc/transcript.jsonl`, { stateDirs: [] }), + ).toBe(false); + }); + + it("does not skip transient extensions without a state anchor", () => { + expect(isVolatileBackupPath("/any/path/daemon.sock", { stateDirs: [] })).toBe(false); + expect(isVolatileBackupPath("/any/path/daemon.pid", { stateDirs: [] })).toBe(false); + expect(isVolatileBackupPath("/any/path/Cargo.lock", { stateDirs: [] })).toBe(false); + }); + + it("does not match paths that escape the anchor via `..`", () => { + // `/opt/openclaw/state/sessions/../config.jsonl` resolves to + // `/opt/openclaw/state/config.jsonl`, which is NOT inside sessions/. + expect(isVolatileBackupPath(`${stateDir}/sessions/../config.jsonl`, plan)).toBe(false); + expect(isVolatileBackupPath(`${stateDir}/cron/runs/../jobs.log`, plan)).toBe(false); + expect(isVolatileBackupPath(`${stateDir}/logs/../notes.jsonl`, plan)).toBe(false); + }); + + it("treats delivery-queue json files under stateDir as volatile", () => { + expect( + isVolatileBackupPath( + `${stateDir}/delivery-queue/3fac5e46-42dc-4230-a725-51c203830b4f.json`, + plan, + ), + ).toBe(true); + }); + + it("treats nested delivery-queue json files under stateDir as volatile", () => { + expect( + isVolatileBackupPath( + `${stateDir}/delivery-queue/subdir/3fac5e46-42dc-4230-a725-51c203830b4f.json`, + plan, + ), + ).toBe(true); + }); + + it("does not treat non-json delivery-queue files as volatile", () => { + expect(isVolatileBackupPath(`${stateDir}/delivery-queue/README.md`, plan)).toBe(false); + }); + + it("does not treat delivery-queue json outside stateDir as volatile", () => { + expect(isVolatileBackupPath(`/tmp/delivery-queue/file.json`, plan)).toBe(false); + }); + + it("normalizes Windows-style separators before anchor checks", () => { + const winStateDir = "C:\\openclaw\\state"; + const winPlan = { stateDirs: [winStateDir] }; + expect(isVolatileBackupPath(`${winStateDir}\\sessions\\s-abc\\transcript.jsonl`, winPlan)).toBe( + true, + ); + expect(isVolatileBackupPath(`${winStateDir}\\agents\\main\\sessions\\s.jsonl`, winPlan)).toBe( + true, + ); + expect(isVolatileBackupPath(`${winStateDir}\\cron\\runs\\2026\\job.jsonl`, winPlan)).toBe(true); + // `..` escape via backslashes must also be rejected. + expect(isVolatileBackupPath(`${winStateDir}\\sessions\\..\\config.jsonl`, winPlan)).toBe(false); + }); + + it("matches tar filter paths when node-tar omits the leading slash", () => { + expect( + isVolatileBackupPath("opt/openclaw/state/agents/main/sessions/transcript.jsonl", plan), + ).toBe(true); + }); + + it("treats session-delivery-queue json files under stateDir as volatile", () => { + expect( + isVolatileBackupPath( + `${stateDir}/session-delivery-queue/3fac5e46-42dc-4230-a725-51c203830b4f.json`, + plan, + ), + ).toBe(true); + }); +}); diff --git a/src/infra/backup-volatile-filter.ts b/src/infra/backup-volatile-filter.ts new file mode 100644 index 00000000000..dc7ca7aa7f9 --- /dev/null +++ b/src/infra/backup-volatile-filter.ts @@ -0,0 +1,131 @@ +import path from "node:path"; + +/** + * Paths that are known to change during a live backup and commonly trigger + * tar EOF errors. These files are actively appended to (logs, sockets, pid + * markers) while `tar.c()` is reading them, which races with the size recorded + * at `lstat()` time. + * + * Skipping them is safe: they are either recreated on startup, are transient + * by nature, or have durable equivalents elsewhere in state. Snapshotting a + * partial tail of a live log has no restoration value. + */ + +const STATE_TRANSIENT_EXTENSIONS = new Set([".sock", ".pid", ".tmp"]); + +function normalizePosix(input: string): string { + if (!input) { + return input; + } + // Swap Windows-style separators, then collapse `.`/`..` segments so ancestry + // checks cannot be bypassed by a path that traverses out of the anchor. + return path.posix.normalize(input.replaceAll("\\", "/")); +} + +function isUnder(childPosix: string, parentPosix: string): boolean { + if (!parentPosix) { + return false; + } + const p = parentPosix.endsWith("/") ? parentPosix : `${parentPosix}/`; + return childPosix === parentPosix || childPosix.startsWith(p); +} + +function hasExtension(filePosix: string, extensions: readonly string[]): boolean { + const ext = path.posix.extname(filePosix).toLowerCase(); + return extensions.includes(ext); +} + +function hasExtensionInSet(filePosix: string, extensions: ReadonlySet): boolean { + return extensions.has(path.posix.extname(filePosix).toLowerCase()); +} + +function isAgentSessionTranscriptPath(filePosix: string, stateDirPosix: string): boolean { + const agentsRoot = path.posix.join(stateDirPosix, "agents"); + if (!isUnder(filePosix, agentsRoot)) { + return false; + } + const relative = path.posix.relative(agentsRoot, filePosix); + const parts = relative.split("/").filter(Boolean); + return parts.length >= 3 && parts[1] === "sessions"; +} + +function filePathCandidates(input: string): string[] { + const normalized = normalizePosix(input); + if (normalized.startsWith("/") || /^[A-Za-z]:\//u.test(normalized)) { + return [normalized]; + } + // node-tar may pass absolute input paths to filters without the leading + // slash, even when the source list used absolute paths. + return [normalized, normalizePosix(`/${normalized}`)]; +} + +export type VolatileFilterPlan = { + /** Canonical state directories the filter should treat as volatile anchors. */ + stateDirs: string[]; +}; + +/** + * Returns true if the given absolute path should be skipped during backup + * because it is a live-mutation target. + * + * Rules: + * - `{stateDir}/sessions/**`/`*.{jsonl,log}` (legacy) + * - `{stateDir}/agents//sessions/**`/`*.{jsonl,log}` + * - `{stateDir}/cron/runs/**`/`*.{jsonl,log}` + * - `{stateDir}/logs/**`/`*.{jsonl,log}` + * - `{stateDir}/{delivery-queue,session-delivery-queue}/**`/`*.{json,tmp}` + * - `{stateDir}/**`/`*.{sock,pid,tmp}` + */ +export function isVolatileBackupPath(absolutePath: string, plan: VolatileFilterPlan): boolean { + if (!absolutePath) { + return false; + } + const candidates = filePathCandidates(absolutePath); + + for (const stateDir of plan.stateDirs) { + if (!stateDir) { + continue; + } + const stateDirPosix = normalizePosix(stateDir); + + for (const filePosix of candidates) { + const sessionsRoot = path.posix.join(stateDirPosix, "sessions"); + if (isUnder(filePosix, sessionsRoot) && hasExtension(filePosix, [".jsonl", ".log"])) { + return true; + } + + if ( + isAgentSessionTranscriptPath(filePosix, stateDirPosix) && + hasExtension(filePosix, [".jsonl", ".log"]) + ) { + return true; + } + + const cronRunsRoot = path.posix.join(stateDirPosix, "cron", "runs"); + if (isUnder(filePosix, cronRunsRoot) && hasExtension(filePosix, [".jsonl", ".log"])) { + return true; + } + + const logsRoot = path.posix.join(stateDirPosix, "logs"); + if (isUnder(filePosix, logsRoot) && hasExtension(filePosix, [".jsonl", ".log"])) { + return true; + } + + for (const queueDir of ["delivery-queue", "session-delivery-queue"]) { + const queueRoot = path.posix.join(stateDirPosix, queueDir); + if (isUnder(filePosix, queueRoot) && hasExtension(filePosix, [".json", ".tmp"])) { + return true; + } + } + + if ( + isUnder(filePosix, stateDirPosix) && + hasExtensionInSet(filePosix, STATE_TRANSIENT_EXTENSIONS) + ) { + return true; + } + } + } + + return false; +}