From 2f2bb7dac630e7e653a688f585bda42866d4e40d Mon Sep 17 00:00:00 2001 From: Cedric <86914379+cdznho@users.noreply.github.com> Date: Sat, 2 May 2026 04:07:19 +0100 Subject: [PATCH] fix(agents): reclaim untracked self-owned session locks (#75822) Summary: - The PR refactors session-lock inspection to reclaim untracked current-process locks with matching starttime during acquisition and startup cleanup, adds regression tests, and adds a changelog entry. - Reproducibility: yes. A high-confidence code-level reproduction is to create a fresh `.jsonl.lock` with `pid ... eLock or cleanStaleLockFiles on current main and observe that acquisition waits or cleanup leaves the lock. ClawSweeper fixups: - Included follow-up commit: docs: add session lock changelog entry - Included follow-up commit: refactor(agents): distill session lock reclaim policy Validation: - ClawSweeper review passed for head 2eae2c93b142dfad2ef39124cf235fde8f1cc7cb. - Required merge gates passed before the squash merge. Prepared head SHA: 2eae2c93b142dfad2ef39124cf235fde8f1cc7cb Review: https://github.com/openclaw/openclaw/pull/75822#issuecomment-4361741599 Co-authored-by: Cedric <86914379+cdznho@users.noreply.github.com> Co-authored-by: Ayaan Zaidi --- CHANGELOG.md | 1 + src/agents/session-write-lock.test.ts | 49 +++++++++++++++ src/agents/session-write-lock.ts | 87 ++++++++++++++++++++------- 3 files changed, 114 insertions(+), 23 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 582cc46507f..bc1c922326a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -54,6 +54,7 @@ Docs: https://docs.openclaw.ai - Slack/setup: print the generated app manifest as plain JSON instead of embedding it inside the framed setup note, so it can be copied into Slack without deleting border characters. Fixes #65751. Thanks @theDanielJLewis. - Channels/WhatsApp: route CLI logout through the live Gateway and stop runtime-backed listeners before channel removal, so removing a WhatsApp account does not leave the old socket replying until restart. Fixes #67746. Thanks @123Mismail. - Voice Call/Twilio: honor TTS directive text and provider voice/model overrides during telephony synthesis, so `[[tts:...]]` tags are not spoken literally and voiceId overrides reach OpenAI/ElevenLabs calls. Fixes #58114. Thanks @legonhilltech-jpg. +- Agents/session-locks: reclaim untracked current-process session locks with matching starttime during acquisition and startup cleanup, so Gateway restarts recover from self-owned orphan `.jsonl.lock` files. Fixes #75805; refs #49603. Thanks @cdznho. - Agents/subagents: initialize built-in context engines before native `sessions_spawn` resolves spawn preparation, so cliBackend-only cold starts no longer fail with an unregistered `legacy` context engine. Fixes #73095. (#73904) Thanks @brokemac79. - Agents/Codex: stop prompting message-tool-only source turns to finish with `NO_REPLY`, so quiet turns are represented by not calling the visible message tool instead of conflicting final-text instructions. Thanks @pashpashpash. - Gateway/config: report failed backup restores as failed in logs and config observe audit records instead of marking them valid. (#70515) Thanks @davidangularme. diff --git a/src/agents/session-write-lock.test.ts b/src/agents/session-write-lock.test.ts index 019fbddcb37..b56e098b799 100644 --- a/src/agents/session-write-lock.test.ts +++ b/src/agents/session-write-lock.test.ts @@ -401,6 +401,43 @@ describe("acquireSessionWriteLock", () => { } }); + it("cleans untracked current-process .jsonl lock files with matching starttime", async () => { + const root = await fs.mkdtemp(path.join(os.tmpdir(), "openclaw-lock-")); + const sessionsDir = path.join(root, "sessions"); + await fs.mkdir(sessionsDir, { recursive: true }); + + const nowMs = Date.now(); + const orphanSelfLock = path.join(sessionsDir, "orphan-self.jsonl.lock"); + + try { + await fs.writeFile( + orphanSelfLock, + JSON.stringify({ + pid: process.pid, + createdAt: new Date(nowMs).toISOString(), + starttime: FAKE_STARTTIME, + }), + "utf8", + ); + + const result = await cleanStaleLockFiles({ + sessionsDir, + staleMs: 30_000, + nowMs, + removeStale: true, + }); + + expect(result.locks).toHaveLength(1); + expect(result.cleaned.map((entry) => path.basename(entry.lockPath))).toEqual([ + "orphan-self.jsonl.lock", + ]); + expect(result.cleaned[0]?.staleReasons).toContain("orphan-self-pid"); + await expect(fs.access(orphanSelfLock)).rejects.toThrow(); + } finally { + await fs.rm(root, { recursive: true, force: true }); + } + }); + it("removes held locks on termination signals", async () => { const signals = ["SIGINT", "SIGTERM", "SIGQUIT", "SIGABRT"] as const; const originalKill = process.kill.bind(process); @@ -456,6 +493,14 @@ describe("acquireSessionWriteLock", () => { }); }); + it("reclaims untracked current-process lock files with matching starttime", async () => { + await withTempSessionLockFile(async ({ sessionFile, lockPath }) => { + await writeCurrentProcessLock(lockPath, { starttime: FAKE_STARTTIME }); + + await expectCurrentPidOwnsLock({ sessionFile, timeoutMs: 500 }); + }); + }); + it("does not reclaim active in-process lock files without starttime", async () => { await expectActiveInProcessLockIsNotReclaimed(); }); @@ -464,6 +509,10 @@ describe("acquireSessionWriteLock", () => { await expectActiveInProcessLockIsNotReclaimed({ legacyStarttime: 123.5 }); }); + it("does not reclaim active in-process lock files with matching starttime", async () => { + await expectActiveInProcessLockIsNotReclaimed({ legacyStarttime: FAKE_STARTTIME }); + }); + it("registers cleanup for SIGQUIT and SIGABRT", () => { expect(__testing.cleanupSignals).toContain("SIGQUIT"); expect(__testing.cleanupSignals).toContain("SIGABRT"); diff --git a/src/agents/session-write-lock.ts b/src/agents/session-write-lock.ts index c318243e903..10298681246 100644 --- a/src/agents/session-write-lock.ts +++ b/src/agents/session-write-lock.ts @@ -348,6 +348,17 @@ async function readLockPayload(lockPath: string): Promise { + const resolvedSessionFile = path.resolve(sessionFile); + const sessionDir = path.dirname(resolvedSessionFile); + try { + const normalizedDir = await fs.realpath(sessionDir); + return path.join(normalizedDir, path.basename(resolvedSessionFile)); + } catch { + return resolvedSessionFile; + } +} + function inspectLockPayload( payload: LockFilePayload | null, staleMs: number, @@ -429,16 +440,51 @@ async function shouldReclaimContendedLockFile( function shouldTreatAsOrphanSelfLock(params: { payload: LockFilePayload | null; normalizedSessionFile: string; + reclaimLockWithoutStarttime: boolean; }): boolean { const pid = isValidLockNumber(params.payload?.pid) ? params.payload.pid : null; if (pid !== process.pid) { return false; } - const hasValidStarttime = isValidLockNumber(params.payload?.starttime); - if (hasValidStarttime) { + if (HELD_LOCKS.has(params.normalizedSessionFile)) { return false; } - return !HELD_LOCKS.has(params.normalizedSessionFile); + + const storedStarttime = isValidLockNumber(params.payload?.starttime) + ? params.payload.starttime + : null; + if (storedStarttime === null) { + return params.reclaimLockWithoutStarttime; + } + + const currentStarttime = getProcessStartTime(process.pid); + return currentStarttime !== null && currentStarttime === storedStarttime; +} + +function inspectLockPayloadForSession(params: { + payload: LockFilePayload | null; + staleMs: number; + nowMs: number; + normalizedSessionFile: string; + reclaimLockWithoutStarttime: boolean; +}): LockInspectionDetails { + const inspected = inspectLockPayload(params.payload, params.staleMs, params.nowMs); + if ( + !shouldTreatAsOrphanSelfLock({ + payload: params.payload, + normalizedSessionFile: params.normalizedSessionFile, + reclaimLockWithoutStarttime: params.reclaimLockWithoutStarttime, + }) + ) { + return inspected; + } + return { + ...inspected, + stale: true, + staleReasons: inspected.staleReasons.includes("orphan-self-pid") + ? inspected.staleReasons + : [...inspected.staleReasons, "orphan-self-pid"], + }; } export async function cleanStaleLockFiles(params: { @@ -476,7 +522,15 @@ export async function cleanStaleLockFiles(params: { for (const entry of lockEntries) { const lockPath = path.join(sessionsDir, entry.name); const payload = await readLockPayload(lockPath); - const inspected = inspectLockPayload(payload, staleMs, nowMs); + const sessionFile = lockPath.slice(0, -".lock".length); + const normalizedSessionFile = await resolveNormalizedSessionFile(sessionFile); + const inspected = inspectLockPayloadForSession({ + payload, + staleMs, + nowMs, + normalizedSessionFile, + reclaimLockWithoutStarttime: false, + }); const lockInfo: SessionLockInspection = { lockPath, ...inspected, @@ -515,13 +569,7 @@ export async function acquireSessionWriteLock(params: { const sessionFile = path.resolve(params.sessionFile); const sessionDir = path.dirname(sessionFile); await fs.mkdir(sessionDir, { recursive: true }); - let normalizedDir = sessionDir; - try { - normalizedDir = await fs.realpath(sessionDir); - } catch { - // Fall back to the resolved path if realpath fails (permissions, transient FS). - } - const normalizedSessionFile = path.join(normalizedDir, path.basename(sessionFile)); + const normalizedSessionFile = await resolveNormalizedSessionFile(sessionFile); const lockPath = `${normalizedSessionFile}.lock`; const held = HELD_LOCKS.get(normalizedSessionFile); @@ -587,21 +635,14 @@ export async function acquireSessionWriteLock(params: { } const payload = await readLockPayload(lockPath); const nowMs = Date.now(); - const inspected = inspectLockPayload(payload, staleMs, nowMs); - const orphanSelfLock = shouldTreatAsOrphanSelfLock({ + const inspected = inspectLockPayloadForSession({ payload, + staleMs, + nowMs, normalizedSessionFile, + reclaimLockWithoutStarttime: true, }); - const reclaimDetails = orphanSelfLock - ? { - ...inspected, - stale: true, - staleReasons: inspected.staleReasons.includes("orphan-self-pid") - ? inspected.staleReasons - : [...inspected.staleReasons, "orphan-self-pid"], - } - : inspected; - if (await shouldReclaimContendedLockFile(lockPath, reclaimDetails, staleMs, nowMs)) { + if (await shouldReclaimContendedLockFile(lockPath, inspected, staleMs, nowMs)) { await fs.rm(lockPath, { force: true }); continue; }