mirror of
https://github.com/openclaw/openclaw.git
synced 2026-05-15 12:20:53 +00:00
fix(backup): retry tar EOF races and skip live volatile files
Fixes #72249.

Summary:
- retry live backup tar EOF races
- skip current live session, cron, log, and delivery-queue state files
- preserve workspace lock/temp files and keep backup --json parseable

Verification:
- Crabbox pre-fix repro: tbx_01kr5xt9vf5pas5ee4aefrp3am
- Crabbox post-fix proof: tbx_01kr5y3e1kbtt6chbypfdydbgs
- pnpm check:test-types
- pnpm lint:core
- pnpm test src/commands/backup.test.ts src/infra/backup-volatile-filter.test.ts src/infra/backup-create.test.ts
- CI on 37664570c7: green

Thanks @abnershang.
This commit is contained in:
@@ -146,6 +146,7 @@ Docs: https://docs.openclaw.ai
|
||||
### Fixes
|
||||
|
||||
- Docs/Subagents: correct the listed sub-agent bootstrap context files to include `SOUL.md`, `IDENTITY.md`, and `USER.md`. (#79470) Thanks @lastguru-net.
|
||||
- Backup: keep live backup archives from copying current agent session transcripts, cron run logs, and delivery queues while preserving workspace lock/temp files and keeping `--json` output parseable when volatile files are skipped. Fixes #72249. (#72251) Thanks @abnershang.
|
||||
- OpenAI/Codex: install the Codex runtime plugin from npm during OpenAI onboarding and load it automatically for implicit OpenAI model routes, while preserving manual PI runtime overrides. Fixes #79358.
|
||||
- OpenAI/realtime voice: defer `response.create` while a realtime response is still active, retry after `response.done`/`response.cancelled`, and align GA input transcription/noise-reduction defaults with the Codex realtime reference so Discord/Voice Call consult results can resume speaking instead of tripping the active-response race.
|
||||
- OpenAI/realtime voice: avoid duplicate barge-in cancellation requests, log realtime model interruption/cutoff events in Discord voice logs, and treat OpenAI's no-active-response cancellation reply as a completed cancel so Discord voice sessions do not wedge pending speech after fast interruptions.
|
||||
|
||||
@@ -53,6 +53,8 @@ skipped.
|
||||
|
||||
The archive payload stores file contents from those source trees, and the embedded `manifest.json` records the resolved absolute source paths plus the archive layout used for each asset.
|
||||
|
||||
During archive creation, OpenClaw skips known live-mutation files that do not have restoration value, including active agent session transcripts, cron run logs, rolling logs, delivery queues, socket/pid/temp files under the state directory, and related durable-queue temp files. The JSON result includes `skippedVolatileCount` so automation can see how many files were intentionally omitted.
|
||||
|
||||
Installed plugin source and manifest files under the state directory's
|
||||
`extensions/` tree are included, but their nested `node_modules/` dependency
|
||||
trees are skipped. Those dependencies are rebuildable install artifacts; after
|
||||
|
||||
@@ -316,6 +316,50 @@ describe("backup commands", () => {
|
||||
}
|
||||
});
|
||||
|
||||
it("keeps volatile-skip notices out of json output", async () => {
|
||||
const stateDir = path.join(tempHome.home, ".openclaw");
|
||||
const backupDir = await fs.mkdtemp(path.join(os.tmpdir(), "openclaw-backups-json-"));
|
||||
try {
|
||||
const runtime = createBackupTestRuntime();
|
||||
await mockStateOnlyBackupPlan(stateDir);
|
||||
tarCreateMock.mockImplementationOnce(
|
||||
async (
|
||||
options: { file: string; filter?: (entryPath: string) => boolean },
|
||||
entryPaths: string[],
|
||||
) => {
|
||||
const manifestPath = entryPaths[0];
|
||||
const stateRoot = entryPaths[1];
|
||||
expect(manifestPath).toBeDefined();
|
||||
expect(stateRoot).toBeDefined();
|
||||
if (!manifestPath || !stateRoot) {
|
||||
throw new Error("backup test expected manifest and state entries");
|
||||
}
|
||||
expect(options.filter?.(manifestPath)).toBe(true);
|
||||
expect(
|
||||
options.filter?.(path.join(stateRoot, "agents", "main", "sessions", "s.jsonl")),
|
||||
).toBe(false);
|
||||
await fs.writeFile(options.file, "archive-bytes", "utf8");
|
||||
},
|
||||
);
|
||||
|
||||
const result = await backupCreateCommand(runtime, {
|
||||
output: backupDir,
|
||||
json: true,
|
||||
});
|
||||
|
||||
expect(result.skippedVolatileCount).toBe(1);
|
||||
expect(runtime.log).toHaveBeenCalledTimes(1);
|
||||
const payload = vi.mocked(runtime.log).mock.calls[0]?.[0];
|
||||
if (typeof payload !== "string") {
|
||||
throw new Error("backup test expected JSON string output");
|
||||
}
|
||||
expect(payload).not.toContain("Backup skipped");
|
||||
expect(JSON.parse(payload)).toMatchObject({ skippedVolatileCount: 1 });
|
||||
} finally {
|
||||
await fs.rm(backupDir, { recursive: true, force: true });
|
||||
}
|
||||
});
|
||||
|
||||
it("rejects output paths that would be created inside a backed-up directory", async () => {
|
||||
const stateDir = path.join(tempHome.home, ".openclaw");
|
||||
await fs.writeFile(path.join(stateDir, "openclaw.json"), JSON.stringify({}), "utf8");
|
||||
|
||||
@@ -22,7 +22,10 @@ export async function backupCreateCommand(
|
||||
runtime: RuntimeEnv,
|
||||
opts: BackupCreateOptions = {},
|
||||
): Promise<BackupCreateResult> {
|
||||
const result = await createBackupArchive(opts);
|
||||
const result = await createBackupArchive({
|
||||
...opts,
|
||||
log: opts.log ?? (opts.json ? undefined : (message: string) => runtime.log(message)),
|
||||
});
|
||||
if (opts.verify && !opts.dryRun) {
|
||||
const { backupVerifyCommand } = await loadBackupVerifyRuntime();
|
||||
await backupVerifyCommand(
|
||||
|
||||
@@ -6,6 +6,7 @@ import { backupVerifyCommand } from "../commands/backup-verify.js";
|
||||
import type { RuntimeEnv } from "../runtime.js";
|
||||
import { withOpenClawTestState } from "../test-utils/openclaw-test-state.js";
|
||||
import {
|
||||
__test as backupCreateInternals,
|
||||
buildExtensionsNodeModulesFilter,
|
||||
createBackupArchive,
|
||||
formatBackupCreateSummary,
|
||||
@@ -23,6 +24,7 @@ function makeResult(overrides: Partial<BackupCreateResult> = {}): BackupCreateRe
|
||||
verified: false,
|
||||
assets: [],
|
||||
skipped: [],
|
||||
skippedVolatileCount: 0,
|
||||
...overrides,
|
||||
};
|
||||
}
|
||||
@@ -106,6 +108,159 @@ describe("formatBackupCreateSummary", () => {
|
||||
])("$name", ({ result, expected }) => {
|
||||
expect(formatBackupCreateSummary(result)).toEqual(expected);
|
||||
});
|
||||
|
||||
it("surfaces the volatile skip count in the summary", () => {
|
||||
expect(
|
||||
formatBackupCreateSummary(
|
||||
makeResult({
|
||||
assets: [
|
||||
{
|
||||
kind: "state",
|
||||
sourcePath: "/state",
|
||||
archivePath: "archive/state",
|
||||
displayPath: "~/.openclaw",
|
||||
},
|
||||
],
|
||||
skippedVolatileCount: 3,
|
||||
}),
|
||||
),
|
||||
).toEqual([
|
||||
"Backup archive: /tmp/openclaw-backup.tar.gz",
|
||||
"Included 1 path:",
|
||||
"- state: ~/.openclaw",
|
||||
"Created /tmp/openclaw-backup.tar.gz",
|
||||
"Skipped 3 volatile files (live sessions, cron logs, queues, sockets, pid/tmp).",
|
||||
]);
|
||||
});
|
||||
});
|
||||
|
||||
describe("isTarEofRaceError", () => {
|
||||
const { isTarEofRaceError } = backupCreateInternals;
|
||||
|
||||
it.each([
|
||||
"did not encounter expected EOF",
|
||||
"encountered unexpected EOF",
|
||||
"TAR_BAD_ARCHIVE: Unrecognized archive format",
|
||||
"Truncated input (needed 512 more bytes, only 0 available) (TAR_BAD_ARCHIVE)",
|
||||
])("matches tar-specific EOF-class error: %s", (message) => {
|
||||
expect(isTarEofRaceError(new Error(message))).toBe(true);
|
||||
});
|
||||
|
||||
it("matches errors by code even when the message is empty", () => {
|
||||
expect(isTarEofRaceError(Object.assign(new Error(""), { code: "EOF" }))).toBe(true);
|
||||
});
|
||||
|
||||
it.each([
|
||||
"EOF occurred in violation of protocol",
|
||||
"unexpected eof while reading",
|
||||
"ran out of EOF markers",
|
||||
"permission denied",
|
||||
"",
|
||||
])("does not match unrelated errors: %s", (message) => {
|
||||
expect(isTarEofRaceError(new Error(message))).toBe(false);
|
||||
});
|
||||
|
||||
it("rejects non-object inputs", () => {
|
||||
expect(isTarEofRaceError(null)).toBe(false);
|
||||
expect(isTarEofRaceError(undefined)).toBe(false);
|
||||
expect(isTarEofRaceError("did not encounter expected EOF")).toBe(false);
|
||||
});
|
||||
});
|
||||
|
||||
describe("writeTarArchiveWithRetry", () => {
|
||||
it("retries on EOF-class errors and eventually succeeds", async () => {
|
||||
const eofErr = Object.assign(new Error("did not encounter expected EOF"), {
|
||||
path: "/state/sessions/s-abc/transcript.jsonl",
|
||||
});
|
||||
const runTar = vi
|
||||
.fn<() => Promise<void>>()
|
||||
.mockRejectedValueOnce(eofErr)
|
||||
.mockRejectedValueOnce(eofErr)
|
||||
.mockResolvedValueOnce(undefined);
|
||||
const log = vi.fn();
|
||||
const sleep = vi.fn<(ms: number) => Promise<void>>().mockResolvedValue(undefined);
|
||||
|
||||
await backupCreateInternals.writeTarArchiveWithRetry({
|
||||
tempArchivePath: "/tmp/backup.tar.gz.tmp",
|
||||
runTar,
|
||||
log,
|
||||
sleepMs: sleep,
|
||||
});
|
||||
|
||||
expect(runTar).toHaveBeenCalledTimes(3);
|
||||
expect(sleep).toHaveBeenNthCalledWith(1, 10_000);
|
||||
expect(sleep).toHaveBeenNthCalledWith(2, 20_000);
|
||||
expect(log).toHaveBeenCalledTimes(2);
|
||||
});
|
||||
|
||||
it("surfaces the offending path and attempt count after exhausting retries", async () => {
|
||||
const eofErr = Object.assign(new Error("did not encounter expected EOF"), {
|
||||
path: "/state/logs/gateway.jsonl",
|
||||
});
|
||||
const runTar = vi.fn<() => Promise<void>>().mockRejectedValue(eofErr);
|
||||
const sleep = vi.fn<(ms: number) => Promise<void>>().mockResolvedValue(undefined);
|
||||
|
||||
await expect(
|
||||
backupCreateInternals.writeTarArchiveWithRetry({
|
||||
tempArchivePath: "/tmp/backup.tar.gz.tmp",
|
||||
runTar,
|
||||
sleepMs: sleep,
|
||||
}),
|
||||
).rejects.toThrow(/last offending path: \/state\/logs\/gateway\.jsonl, after 3 attempts/);
|
||||
expect(runTar).toHaveBeenCalledTimes(3);
|
||||
});
|
||||
|
||||
it("lets callers reset per-attempt counters so retries report the final attempt's count, not a running sum", async () => {
|
||||
// Simulate the caller's pattern: a closure counter populated by a filter
|
||||
// that tar.c invokes while walking the tree. Each attempt re-walks the
|
||||
// same tree, so the runTar closure must reset the counter before calling
|
||||
// tar.c -- otherwise the reported count accumulates across attempts.
|
||||
let skippedVolatileCount = 0;
|
||||
const volatileFilesSeenPerAttempt = 5;
|
||||
let attempt = 0;
|
||||
|
||||
const eofErr = Object.assign(new Error("did not encounter expected EOF"), {
|
||||
path: "/state/sessions/s-abc/transcript.jsonl",
|
||||
});
|
||||
|
||||
const runTar = vi.fn<() => Promise<void>>().mockImplementation(async () => {
|
||||
attempt += 1;
|
||||
skippedVolatileCount = 0;
|
||||
for (let i = 0; i < volatileFilesSeenPerAttempt; i += 1) {
|
||||
skippedVolatileCount += 1;
|
||||
}
|
||||
if (attempt < 3) {
|
||||
throw eofErr;
|
||||
}
|
||||
});
|
||||
const sleep = vi.fn<(ms: number) => Promise<void>>().mockResolvedValue(undefined);
|
||||
|
||||
await backupCreateInternals.writeTarArchiveWithRetry({
|
||||
tempArchivePath: "/tmp/backup.tar.gz.tmp",
|
||||
runTar,
|
||||
sleepMs: sleep,
|
||||
});
|
||||
|
||||
expect(runTar).toHaveBeenCalledTimes(3);
|
||||
// Without the reset, this would be 15 (5 * 3 attempts). With the reset,
|
||||
// it equals the count from the final (successful) attempt.
|
||||
expect(skippedVolatileCount).toBe(volatileFilesSeenPerAttempt);
|
||||
});
|
||||
|
||||
it("does not retry on non-EOF errors", async () => {
|
||||
const runTar = vi.fn<() => Promise<void>>().mockRejectedValue(new Error("permission denied"));
|
||||
const sleep = vi.fn<(ms: number) => Promise<void>>().mockResolvedValue(undefined);
|
||||
|
||||
await expect(
|
||||
backupCreateInternals.writeTarArchiveWithRetry({
|
||||
tempArchivePath: "/tmp/backup.tar.gz.tmp",
|
||||
runTar,
|
||||
sleepMs: sleep,
|
||||
}),
|
||||
).rejects.toThrow(/permission denied/);
|
||||
expect(runTar).toHaveBeenCalledTimes(1);
|
||||
expect(sleep).not.toHaveBeenCalled();
|
||||
});
|
||||
});
|
||||
|
||||
describe("buildExtensionsNodeModulesFilter", () => {
|
||||
@@ -131,6 +286,65 @@ describe("buildExtensionsNodeModulesFilter", () => {
|
||||
});
|
||||
|
||||
describe("createBackupArchive", () => {
|
||||
it("skips current live volatile state files while preserving workspace locks", async () => {
|
||||
await withOpenClawTestState(
|
||||
{
|
||||
layout: "split",
|
||||
prefix: "openclaw-backup-volatile-",
|
||||
scenario: "minimal",
|
||||
},
|
||||
async (state) => {
|
||||
const outputDir = state.path("backups");
|
||||
await state.writeConfig({
|
||||
agents: {
|
||||
list: [{ id: "main", default: true, workspace: state.workspaceDir }],
|
||||
},
|
||||
});
|
||||
await fs.mkdir(outputDir, { recursive: true });
|
||||
await fs.writeFile(path.join(state.workspaceDir, "Cargo.lock"), "workspace lock\n", "utf8");
|
||||
await fs.writeFile(
|
||||
path.join(state.workspaceDir, "pending.tmp"),
|
||||
"workspace temp fixture\n",
|
||||
"utf8",
|
||||
);
|
||||
await state.writeText("agents/main/sessions/live-session.jsonl", "session\n");
|
||||
await state.writeText("sessions/legacy-session.jsonl", "legacy session\n");
|
||||
await state.writeText("cron/runs/nightly.jsonl", "cron\n");
|
||||
await state.writeText("logs/gateway.log", "log\n");
|
||||
await state.writeJson("delivery-queue/message.json", { id: "delivery" });
|
||||
await state.writeJson("session-delivery-queue/message.json", { id: "session-delivery" });
|
||||
await state.writeText("tmp/staged.tmp", "tmp\n");
|
||||
await state.writeText("gateway.pid", "123\n");
|
||||
|
||||
const result = await createBackupArchive({
|
||||
output: outputDir,
|
||||
includeWorkspace: true,
|
||||
nowMs: Date.UTC(2026, 4, 9, 8, 0, 0),
|
||||
});
|
||||
const entries = await listArchiveEntries(result.archivePath);
|
||||
|
||||
expect(entries.some((entry) => entry.endsWith("/workspace/Cargo.lock"))).toBe(true);
|
||||
expect(entries.some((entry) => entry.endsWith("/workspace/pending.tmp"))).toBe(true);
|
||||
for (const suffix of [
|
||||
"/state/agents/main/sessions/live-session.jsonl",
|
||||
"/state/sessions/legacy-session.jsonl",
|
||||
"/state/cron/runs/nightly.jsonl",
|
||||
"/state/logs/gateway.log",
|
||||
"/state/delivery-queue/message.json",
|
||||
"/state/session-delivery-queue/message.json",
|
||||
"/state/tmp/staged.tmp",
|
||||
"/state/gateway.pid",
|
||||
]) {
|
||||
expect(
|
||||
entries.some((entry) => entry.endsWith(suffix)),
|
||||
suffix,
|
||||
).toBe(false);
|
||||
}
|
||||
expect(result.skippedVolatileCount).toBe(8);
|
||||
},
|
||||
);
|
||||
});
|
||||
|
||||
it("omits installed plugin node_modules from the real archive while keeping plugin files", async () => {
|
||||
await withOpenClawTestState(
|
||||
{
|
||||
|
||||
@@ -13,6 +13,7 @@ import {
|
||||
import { isPathWithin } from "../commands/cleanup-utils.js";
|
||||
import { resolveHomeDir, resolveUserPath } from "../utils.js";
|
||||
import { resolveRuntimeServiceVersion } from "../version.js";
|
||||
import { isVolatileBackupPath } from "./backup-volatile-filter.js";
|
||||
import { writeJson } from "./json-files.js";
|
||||
|
||||
type TarRuntime = typeof import("tar");
|
||||
@@ -32,6 +33,12 @@ export type BackupCreateOptions = {
|
||||
verify?: boolean;
|
||||
json?: boolean;
|
||||
nowMs?: number;
|
||||
/**
|
||||
* Optional info logger invoked for non-fatal backup events such as tar
|
||||
* retry notices or volatile-file skip counts. When omitted, events are
|
||||
* silent aside from the final result.
|
||||
*/
|
||||
log?: (message: string) => void;
|
||||
};
|
||||
|
||||
type BackupManifestAsset = {
|
||||
@@ -82,8 +89,91 @@ export type BackupCreateResult = {
|
||||
reason: string;
|
||||
coveredBy?: string;
|
||||
}>;
|
||||
/**
|
||||
* Count of files the archiver actively skipped because they matched the
|
||||
* known-volatile filter (live sessions, cron logs, queues, sockets, pid/tmp).
|
||||
* Populated on real writes only; dry runs report 0.
|
||||
*/
|
||||
skippedVolatileCount: number;
|
||||
};
|
||||
|
||||
const BACKUP_TAR_MAX_ATTEMPTS = 3;
|
||||
// Backoff between attempts: wait 10s before attempt 2, 20s before attempt 3.
|
||||
const BACKUP_TAR_BACKOFF_MS = [10_000, 20_000];
|
||||
|
||||
function isTarEofRaceError(err: unknown): boolean {
|
||||
if (!err || typeof err !== "object") {
|
||||
return false;
|
||||
}
|
||||
const code = (err as NodeJS.ErrnoException).code;
|
||||
if (code === "EOF") {
|
||||
return true;
|
||||
}
|
||||
// Keep this regex narrow: match only the two tar-specific EOF-class error
|
||||
// strings thrown by node-tar's WriteEntry#onread (grow and shrink races,
|
||||
// see node_modules/tar/dist/commonjs/write-entry.js around the
|
||||
// "did not encounter expected EOF" and "encountered unexpected EOF"
|
||||
// Object.assign sites), plus the TAR_BAD_ARCHIVE code surfaced by the
|
||||
// parser on truncated input. A bare /EOF/i alternative also matched
|
||||
// unrelated SSL/OpenSSL strings like "EOF occurred in violation of
|
||||
// protocol" and "unexpected eof while reading", causing pointless retries.
|
||||
const message = (err as Error).message ?? "";
|
||||
return /(did not encounter expected|encountered unexpected) EOF|TAR_BAD_ARCHIVE/i.test(message);
|
||||
}
|
||||
|
||||
function sleep(ms: number): Promise<void> {
|
||||
return new Promise((resolve) => setTimeout(resolve, ms));
|
||||
}
|
||||
|
||||
export type BackupTarRetryLogger = (message: string) => void;
|
||||
|
||||
async function writeTarArchiveWithRetry(params: {
|
||||
tempArchivePath: string;
|
||||
runTar: () => Promise<void>;
|
||||
log?: BackupTarRetryLogger;
|
||||
sleepMs?: (ms: number) => Promise<void>;
|
||||
}): Promise<void> {
|
||||
const sleepFn = params.sleepMs ?? sleep;
|
||||
let lastErr: unknown;
|
||||
for (let attempt = 1; attempt <= BACKUP_TAR_MAX_ATTEMPTS; attempt += 1) {
|
||||
try {
|
||||
await params.runTar();
|
||||
return;
|
||||
} catch (err) {
|
||||
lastErr = err;
|
||||
if (!isTarEofRaceError(err) || attempt === BACKUP_TAR_MAX_ATTEMPTS) {
|
||||
break;
|
||||
}
|
||||
try {
|
||||
await fs.rm(params.tempArchivePath, { force: true });
|
||||
} catch (cleanupErr) {
|
||||
const code = (cleanupErr as NodeJS.ErrnoException).code;
|
||||
if (code && code !== "ENOENT") {
|
||||
params.log?.(
|
||||
`Backup archiver could not remove temp archive ${params.tempArchivePath} between retries: ${code}. Continuing.`,
|
||||
);
|
||||
}
|
||||
}
|
||||
const backoff = BACKUP_TAR_BACKOFF_MS[attempt - 1] ?? 0;
|
||||
const offendingPath = (err as NodeJS.ErrnoException).path;
|
||||
params.log?.(
|
||||
`Backup archiver hit a live-write race${
|
||||
offendingPath ? ` on ${offendingPath}` : ""
|
||||
} (attempt ${attempt}/${BACKUP_TAR_MAX_ATTEMPTS}); retrying in ${Math.round(backoff / 1000)}s.`,
|
||||
);
|
||||
await sleepFn(backoff);
|
||||
}
|
||||
}
|
||||
const final = lastErr instanceof Error ? lastErr : new Error(String(lastErr));
|
||||
const offendingPath = (lastErr as NodeJS.ErrnoException | undefined)?.path;
|
||||
const suffix = offendingPath
|
||||
? ` (last offending path: ${offendingPath}, after ${BACKUP_TAR_MAX_ATTEMPTS} attempts)`
|
||||
: ` (after ${BACKUP_TAR_MAX_ATTEMPTS} attempts)`;
|
||||
throw new Error(`Backup archive write failed: ${final.message}${suffix}`, { cause: final });
|
||||
}
|
||||
|
||||
export const __test = { writeTarArchiveWithRetry, isTarEofRaceError };
|
||||
|
||||
async function resolveOutputPath(params: {
|
||||
output?: string;
|
||||
nowMs: number;
|
||||
@@ -259,6 +349,13 @@ export function formatBackupCreateSummary(result: BackupCreateResult): string[]
|
||||
lines.push("Dry run only; archive was not written.");
|
||||
} else {
|
||||
lines.push(`Created ${result.archivePath}`);
|
||||
if (result.skippedVolatileCount > 0) {
|
||||
lines.push(
|
||||
`Skipped ${result.skippedVolatileCount} volatile file${
|
||||
result.skippedVolatileCount === 1 ? "" : "s"
|
||||
} (live sessions, cron logs, queues, sockets, pid/tmp).`,
|
||||
);
|
||||
}
|
||||
if (result.verified) {
|
||||
lines.push("Archive verification: passed");
|
||||
}
|
||||
@@ -344,6 +441,7 @@ export async function createBackupArchive(
|
||||
verified: false,
|
||||
assets: plan.included,
|
||||
skipped: plan.skipped,
|
||||
skippedVolatileCount: 0,
|
||||
};
|
||||
|
||||
if (opts.dryRun) {
|
||||
@@ -371,24 +469,61 @@ export async function createBackupArchive(
|
||||
|
||||
const tar = await loadTarRuntime();
|
||||
const stateAsset = result.assets.find((asset) => asset.kind === "state");
|
||||
const filter = stateAsset ? buildExtensionsNodeModulesFilter(stateAsset.sourcePath) : undefined;
|
||||
await tar.c(
|
||||
{
|
||||
file: tempArchivePath,
|
||||
...(filter ? { filter } : {}),
|
||||
gzip: true,
|
||||
portable: true,
|
||||
preservePaths: true,
|
||||
onWriteEntry: (entry) => {
|
||||
entry.path = remapArchiveEntryPath({
|
||||
entryPath: entry.path,
|
||||
manifestPath,
|
||||
archiveRoot,
|
||||
});
|
||||
},
|
||||
const extensionsFilter = stateAsset
|
||||
? buildExtensionsNodeModulesFilter(stateAsset.sourcePath)
|
||||
: undefined;
|
||||
const volatilePlan = { stateDirs: [stateAsset?.sourcePath ?? plan.stateDir] };
|
||||
let skippedVolatileCount = 0;
|
||||
const tarFilter = (entryPath: string): boolean => {
|
||||
// The manifest is staged in a tmp dir outside any state directory and
|
||||
// is always safe to include.
|
||||
if (path.resolve(entryPath) === manifestPath) {
|
||||
return true;
|
||||
}
|
||||
if (extensionsFilter && !extensionsFilter(entryPath)) {
|
||||
return false;
|
||||
}
|
||||
if (isVolatileBackupPath(entryPath, volatilePlan)) {
|
||||
skippedVolatileCount += 1;
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
};
|
||||
await writeTarArchiveWithRetry({
|
||||
tempArchivePath,
|
||||
log: opts.log,
|
||||
runTar: () => {
|
||||
// tar.c re-walks the tree (and thus re-invokes tarFilter) on every
|
||||
// attempt, so reset the closure counter here or retries would report
|
||||
// cumulative skip counts across attempts instead of the final one.
|
||||
skippedVolatileCount = 0;
|
||||
return tar.c(
|
||||
{
|
||||
file: tempArchivePath,
|
||||
gzip: true,
|
||||
portable: true,
|
||||
preservePaths: true,
|
||||
filter: tarFilter,
|
||||
onWriteEntry: (entry) => {
|
||||
entry.path = remapArchiveEntryPath({
|
||||
entryPath: entry.path,
|
||||
manifestPath,
|
||||
archiveRoot,
|
||||
});
|
||||
},
|
||||
},
|
||||
[manifestPath, ...result.assets.map((asset) => asset.sourcePath)],
|
||||
);
|
||||
},
|
||||
[manifestPath, ...result.assets.map((asset) => asset.sourcePath)],
|
||||
);
|
||||
});
|
||||
result.skippedVolatileCount = skippedVolatileCount;
|
||||
if (skippedVolatileCount > 0) {
|
||||
opts.log?.(
|
||||
`Backup skipped ${skippedVolatileCount} volatile file${
|
||||
skippedVolatileCount === 1 ? "" : "s"
|
||||
} (live sessions, cron logs, queues, sockets, pid/tmp).`,
|
||||
);
|
||||
}
|
||||
await publishTempArchive({ tempArchivePath, outputPath });
|
||||
} finally {
|
||||
await fs.rm(tempArchivePath, { force: true }).catch(() => undefined);
|
||||
|
||||
src/infra/backup-volatile-filter.test.ts — new file, 120 lines
@@ -0,0 +1,120 @@
|
||||
import { describe, expect, it } from "vitest";
|
||||
import { isVolatileBackupPath } from "./backup-volatile-filter.js";
|
||||
|
||||
const stateDir = "/opt/openclaw/state";
|
||||
const plan = { stateDirs: [stateDir] };
|
||||
|
||||
describe("isVolatileBackupPath", () => {
|
||||
it.each([
|
||||
// volatile: session transcripts
|
||||
[`${stateDir}/sessions/s-abc/transcript.jsonl`, true],
|
||||
[`${stateDir}/sessions/s-abc/run.log`, true],
|
||||
[`${stateDir}/agents/main/sessions/transcript.jsonl`, true],
|
||||
[`${stateDir}/agents/ops/sessions/run.log`, true],
|
||||
// volatile: cron run logs
|
||||
[`${stateDir}/cron/runs/2026-01-01/job.log`, true],
|
||||
[`${stateDir}/cron/runs/nightly.jsonl`, true],
|
||||
// volatile: generic state logs
|
||||
[`${stateDir}/logs/gateway.jsonl`, true],
|
||||
[`${stateDir}/logs/nested/gateway.log`, true],
|
||||
// volatile: sockets/pids/tmp under state
|
||||
[`${stateDir}/ipc/gateway.sock`, true],
|
||||
[`${stateDir}/gateway.pid`, true],
|
||||
[`${stateDir}/tmp/pending.tmp`, true],
|
||||
[`${stateDir}/delivery-queue/pending.tmp`, true],
|
||||
[`${stateDir}/session-delivery-queue/pending.tmp`, true],
|
||||
|
||||
// non-volatile: session config, not jsonl/log
|
||||
[`${stateDir}/sessions/s-abc/meta.json`, false],
|
||||
[`${stateDir}/agents/main/sessions/sessions.json`, false],
|
||||
// non-volatile: cron definitions
|
||||
[`${stateDir}/cron/jobs.json`, false],
|
||||
// non-volatile: cron runs but wrong extension
|
||||
[`${stateDir}/cron/runs/2026-01-01/job.json`, false],
|
||||
// non-volatile: plain config
|
||||
[`${stateDir}/config.json`, false],
|
||||
// non-volatile: workspace files outside state
|
||||
["/home/user/project/README.md", false],
|
||||
["/home/user/project/Cargo.lock", false],
|
||||
["/home/user/project/pending.tmp", false],
|
||||
// non-volatile: log-like name outside scope
|
||||
["/home/user/notes/daily.log", false],
|
||||
])("classifies %s as volatile=%s", (p, expected) => {
|
||||
expect(isVolatileBackupPath(p, plan)).toBe(expected);
|
||||
});
|
||||
|
||||
it("returns false when no state dirs are provided", () => {
|
||||
expect(
|
||||
isVolatileBackupPath(`${stateDir}/sessions/s-abc/transcript.jsonl`, { stateDirs: [] }),
|
||||
).toBe(false);
|
||||
});
|
||||
|
||||
it("does not skip transient extensions without a state anchor", () => {
|
||||
expect(isVolatileBackupPath("/any/path/daemon.sock", { stateDirs: [] })).toBe(false);
|
||||
expect(isVolatileBackupPath("/any/path/daemon.pid", { stateDirs: [] })).toBe(false);
|
||||
expect(isVolatileBackupPath("/any/path/Cargo.lock", { stateDirs: [] })).toBe(false);
|
||||
});
|
||||
|
||||
it("does not match paths that escape the anchor via `..`", () => {
|
||||
// `/opt/openclaw/state/sessions/../config.jsonl` resolves to
|
||||
// `/opt/openclaw/state/config.jsonl`, which is NOT inside sessions/.
|
||||
expect(isVolatileBackupPath(`${stateDir}/sessions/../config.jsonl`, plan)).toBe(false);
|
||||
expect(isVolatileBackupPath(`${stateDir}/cron/runs/../jobs.log`, plan)).toBe(false);
|
||||
expect(isVolatileBackupPath(`${stateDir}/logs/../notes.jsonl`, plan)).toBe(false);
|
||||
});
|
||||
|
||||
it("treats delivery-queue json files under stateDir as volatile", () => {
|
||||
expect(
|
||||
isVolatileBackupPath(
|
||||
`${stateDir}/delivery-queue/3fac5e46-42dc-4230-a725-51c203830b4f.json`,
|
||||
plan,
|
||||
),
|
||||
).toBe(true);
|
||||
});
|
||||
|
||||
it("treats nested delivery-queue json files under stateDir as volatile", () => {
|
||||
expect(
|
||||
isVolatileBackupPath(
|
||||
`${stateDir}/delivery-queue/subdir/3fac5e46-42dc-4230-a725-51c203830b4f.json`,
|
||||
plan,
|
||||
),
|
||||
).toBe(true);
|
||||
});
|
||||
|
||||
it("does not treat non-json delivery-queue files as volatile", () => {
|
||||
expect(isVolatileBackupPath(`${stateDir}/delivery-queue/README.md`, plan)).toBe(false);
|
||||
});
|
||||
|
||||
it("does not treat delivery-queue json outside stateDir as volatile", () => {
|
||||
expect(isVolatileBackupPath(`/tmp/delivery-queue/file.json`, plan)).toBe(false);
|
||||
});
|
||||
|
||||
it("normalizes Windows-style separators before anchor checks", () => {
|
||||
const winStateDir = "C:\\openclaw\\state";
|
||||
const winPlan = { stateDirs: [winStateDir] };
|
||||
expect(isVolatileBackupPath(`${winStateDir}\\sessions\\s-abc\\transcript.jsonl`, winPlan)).toBe(
|
||||
true,
|
||||
);
|
||||
expect(isVolatileBackupPath(`${winStateDir}\\agents\\main\\sessions\\s.jsonl`, winPlan)).toBe(
|
||||
true,
|
||||
);
|
||||
expect(isVolatileBackupPath(`${winStateDir}\\cron\\runs\\2026\\job.jsonl`, winPlan)).toBe(true);
|
||||
// `..` escape via backslashes must also be rejected.
|
||||
expect(isVolatileBackupPath(`${winStateDir}\\sessions\\..\\config.jsonl`, winPlan)).toBe(false);
|
||||
});
|
||||
|
||||
it("matches tar filter paths when node-tar omits the leading slash", () => {
|
||||
expect(
|
||||
isVolatileBackupPath("opt/openclaw/state/agents/main/sessions/transcript.jsonl", plan),
|
||||
).toBe(true);
|
||||
});
|
||||
|
||||
it("treats session-delivery-queue json files under stateDir as volatile", () => {
|
||||
expect(
|
||||
isVolatileBackupPath(
|
||||
`${stateDir}/session-delivery-queue/3fac5e46-42dc-4230-a725-51c203830b4f.json`,
|
||||
plan,
|
||||
),
|
||||
).toBe(true);
|
||||
});
|
||||
});
|
||||
src/infra/backup-volatile-filter.ts — new file, 131 lines
@@ -0,0 +1,131 @@
|
||||
import path from "node:path";
|
||||
|
||||
/**
|
||||
* Paths that are known to change during a live backup and commonly trigger
|
||||
* tar EOF errors. These files are actively appended to (logs, sockets, pid
|
||||
* markers) while `tar.c()` is reading them, which races with the size recorded
|
||||
* at `lstat()` time.
|
||||
*
|
||||
* Skipping them is safe: they are either recreated on startup, are transient
|
||||
* by nature, or have durable equivalents elsewhere in state. Snapshotting a
|
||||
* partial tail of a live log has no restoration value.
|
||||
*/
|
||||
|
||||
const STATE_TRANSIENT_EXTENSIONS = new Set([".sock", ".pid", ".tmp"]);
|
||||
|
||||
function normalizePosix(input: string): string {
|
||||
if (!input) {
|
||||
return input;
|
||||
}
|
||||
// Swap Windows-style separators, then collapse `.`/`..` segments so ancestry
|
||||
// checks cannot be bypassed by a path that traverses out of the anchor.
|
||||
return path.posix.normalize(input.replaceAll("\\", "/"));
|
||||
}
|
||||
|
||||
function isUnder(childPosix: string, parentPosix: string): boolean {
|
||||
if (!parentPosix) {
|
||||
return false;
|
||||
}
|
||||
const p = parentPosix.endsWith("/") ? parentPosix : `${parentPosix}/`;
|
||||
return childPosix === parentPosix || childPosix.startsWith(p);
|
||||
}
|
||||
|
||||
function hasExtension(filePosix: string, extensions: readonly string[]): boolean {
|
||||
const ext = path.posix.extname(filePosix).toLowerCase();
|
||||
return extensions.includes(ext);
|
||||
}
|
||||
|
||||
function hasExtensionInSet(filePosix: string, extensions: ReadonlySet<string>): boolean {
|
||||
return extensions.has(path.posix.extname(filePosix).toLowerCase());
|
||||
}
|
||||
|
||||
function isAgentSessionTranscriptPath(filePosix: string, stateDirPosix: string): boolean {
|
||||
const agentsRoot = path.posix.join(stateDirPosix, "agents");
|
||||
if (!isUnder(filePosix, agentsRoot)) {
|
||||
return false;
|
||||
}
|
||||
const relative = path.posix.relative(agentsRoot, filePosix);
|
||||
const parts = relative.split("/").filter(Boolean);
|
||||
return parts.length >= 3 && parts[1] === "sessions";
|
||||
}
|
||||
|
||||
function filePathCandidates(input: string): string[] {
|
||||
const normalized = normalizePosix(input);
|
||||
if (normalized.startsWith("/") || /^[A-Za-z]:\//u.test(normalized)) {
|
||||
return [normalized];
|
||||
}
|
||||
// node-tar may pass absolute input paths to filters without the leading
|
||||
// slash, even when the source list used absolute paths.
|
||||
return [normalized, normalizePosix(`/${normalized}`)];
|
||||
}
|
||||
|
||||
export type VolatileFilterPlan = {
|
||||
/** Canonical state directories the filter should treat as volatile anchors. */
|
||||
stateDirs: string[];
|
||||
};
|
||||
|
||||
/**
|
||||
* Returns true if the given absolute path should be skipped during backup
|
||||
* because it is a live-mutation target.
|
||||
*
|
||||
* Rules:
|
||||
* - `{stateDir}/sessions/**`/`*.{jsonl,log}` (legacy)
|
||||
* - `{stateDir}/agents/<agentId>/sessions/**`/`*.{jsonl,log}`
|
||||
* - `{stateDir}/cron/runs/**`/`*.{jsonl,log}`
|
||||
* - `{stateDir}/logs/**`/`*.{jsonl,log}`
|
||||
* - `{stateDir}/{delivery-queue,session-delivery-queue}/**`/`*.{json,tmp}`
|
||||
* - `{stateDir}/**`/`*.{sock,pid,tmp}`
|
||||
*/
|
||||
export function isVolatileBackupPath(absolutePath: string, plan: VolatileFilterPlan): boolean {
|
||||
if (!absolutePath) {
|
||||
return false;
|
||||
}
|
||||
const candidates = filePathCandidates(absolutePath);
|
||||
|
||||
for (const stateDir of plan.stateDirs) {
|
||||
if (!stateDir) {
|
||||
continue;
|
||||
}
|
||||
const stateDirPosix = normalizePosix(stateDir);
|
||||
|
||||
for (const filePosix of candidates) {
|
||||
const sessionsRoot = path.posix.join(stateDirPosix, "sessions");
|
||||
if (isUnder(filePosix, sessionsRoot) && hasExtension(filePosix, [".jsonl", ".log"])) {
|
||||
return true;
|
||||
}
|
||||
|
||||
if (
|
||||
isAgentSessionTranscriptPath(filePosix, stateDirPosix) &&
|
||||
hasExtension(filePosix, [".jsonl", ".log"])
|
||||
) {
|
||||
return true;
|
||||
}
|
||||
|
||||
const cronRunsRoot = path.posix.join(stateDirPosix, "cron", "runs");
|
||||
if (isUnder(filePosix, cronRunsRoot) && hasExtension(filePosix, [".jsonl", ".log"])) {
|
||||
return true;
|
||||
}
|
||||
|
||||
const logsRoot = path.posix.join(stateDirPosix, "logs");
|
||||
if (isUnder(filePosix, logsRoot) && hasExtension(filePosix, [".jsonl", ".log"])) {
|
||||
return true;
|
||||
}
|
||||
|
||||
for (const queueDir of ["delivery-queue", "session-delivery-queue"]) {
|
||||
const queueRoot = path.posix.join(stateDirPosix, queueDir);
|
||||
if (isUnder(filePosix, queueRoot) && hasExtension(filePosix, [".json", ".tmp"])) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
if (
|
||||
isUnder(filePosix, stateDirPosix) &&
|
||||
hasExtensionInSet(filePosix, STATE_TRANSIENT_EXTENSIONS)
|
||||
) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
Reference in New Issue
Block a user